]> git.proxmox.com Git - mirror_qemu.git/blame - nbd/server.c
Merge tag 'pull-request-2023-12-20' of https://gitlab.com/thuth/qemu into staging
[mirror_qemu.git] / nbd / server.c
CommitLineData
75818250 1/*
a7c8ed36 2 * Copyright Red Hat
7a5ca864
FB
3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
4 *
798bfe00 5 * Network Block Device Server Side
7a5ca864
FB
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; under version 2 of the License.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
8167ee88 17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
75818250 18 */
7a5ca864 19
d38ea87a 20#include "qemu/osdep.h"
56ee8626 21
e2c1c34f 22#include "block/block_int.h"
56ee8626 23#include "block/export.h"
e2c1c34f 24#include "block/dirty-bitmap.h"
da34e65c 25#include "qapi/error.h"
dc5e9ac7 26#include "qemu/queue.h"
9588463e 27#include "trace.h"
798bfe00 28#include "nbd-internal.h"
416e34bd 29#include "qemu/units.h"
5df022cf 30#include "qemu/memalign.h"
ca441480 31
/*
 * Ids of the meta contexts.  Dirty bitmaps use
 * 'NBD_META_ID_DIRTY_BITMAP + i', so that id has to remain the last
 * (largest) one.
 */
#define NBD_META_ID_BASE_ALLOCATION 0
#define NBD_META_ID_ALLOCATION_DEPTH 1
#define NBD_META_ID_DIRTY_BITMAP 2
3d068aff 36
416e34bd
EB
/*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: upper bound on the number of extents,
 * chosen so that the extents data amounts to 1 MiB (the divisor 8 is
 * presumably the wire size in bytes of one extent -- TODO confirm).
 * This is an empirical constant.  If it ever needs to grow, keep in mind
 * that the NBD protocol recommends replies no larger than 32 MiB, so
 * that the client won't consider the reply a denial of service attack.
 */
#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
e7b1948d 44
ca441480
PB
45static int system_errno_to_nbd_errno(int err)
46{
47 switch (err) {
48 case 0:
49 return NBD_SUCCESS;
50 case EPERM:
c0301fcc 51 case EROFS:
ca441480
PB
52 return NBD_EPERM;
53 case EIO:
54 return NBD_EIO;
55 case ENOMEM:
56 return NBD_ENOMEM;
57#ifdef EDQUOT
58 case EDQUOT:
59#endif
60 case EFBIG:
61 case ENOSPC:
62 return NBD_ENOSPC;
bae245d1
EB
63 case EOVERFLOW:
64 return NBD_EOVERFLOW;
0a479545
EB
65 case ENOTSUP:
66#if ENOTSUP != EOPNOTSUPP
67 case EOPNOTSUPP:
68#endif
69 return NBD_ENOTSUP;
b6f5d3b5
EB
70 case ESHUTDOWN:
71 return NBD_ESHUTDOWN;
ca441480
PB
72 case EINVAL:
73 default:
74 return NBD_EINVAL;
75 }
76}
77
9a304d29
PB
78/* Definitions for opaque data types */
79
315f78ab 80typedef struct NBDRequestData NBDRequestData;
9a304d29 81
315f78ab 82struct NBDRequestData {
9a304d29
PB
83 NBDClient *client;
84 uint8_t *data;
29b6c3b3 85 bool complete;
9a304d29
PB
86};
87
88struct NBDExport {
56ee8626 89 BlockExport common;
0ddf08db 90
ee0a19ec 91 char *name;
b1a75b33 92 char *description;
9d26dfcb 93 uint64_t size;
7423f417 94 uint16_t nbdflags;
4b9441f6 95 QTAILQ_HEAD(, NBDClient) clients;
ee0a19ec 96 QTAILQ_ENTRY(NBDExport) next;
958c717d 97
cd7fca95 98 BlockBackend *eject_notifier_blk;
741cc431 99 Notifier eject_notifier;
3d068aff 100
71719cd5 101 bool allocation_depth;
3b1f244c
EB
102 BdrvDirtyBitmap **export_bitmaps;
103 size_t nr_export_bitmaps;
9a304d29
PB
104};
105
ee0a19ec
PB
106static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
107
fd358d83
EB
108/*
109 * NBDMetaContexts represents a list of meta contexts in use,
e7b1948d 110 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
fd358d83
EB
111 * NBD_OPT_LIST_META_CONTEXT.
112 */
113struct NBDMetaContexts {
114 const NBDExport *exp; /* associated export */
47ec485e 115 size_t count; /* number of negotiated contexts */
e7b1948d 116 bool base_allocation; /* export base:allocation context (block status) */
71719cd5 117 bool allocation_depth; /* export qemu:allocation-depth */
3b1f244c
EB
118 bool *bitmaps; /*
119 * export qemu:dirty-bitmap:<export bitmap name>,
120 * sized by exp->nr_export_bitmaps
121 */
fd358d83 122};
e7b1948d 123
9a304d29
PB
124struct NBDClient {
125 int refcount;
0c9390d9 126 void (*close_fn)(NBDClient *client, bool negotiated);
9a304d29
PB
127
128 NBDExport *exp;
f95910fe 129 QCryptoTLSCreds *tlscreds;
b25e12da 130 char *tlsauthz;
1c778ef7
DB
131 QIOChannelSocket *sioc; /* The underlying data channel */
132 QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
9a304d29
PB
133
134 Coroutine *recv_coroutine;
135
136 CoMutex send_lock;
137 Coroutine *send_coroutine;
138
f148ae7d
SL
139 bool read_yielding;
140 bool quiescing;
141
4b9441f6 142 QTAILQ_ENTRY(NBDClient) next;
9a304d29 143 int nb_requests;
ff2b68aa 144 bool closing;
5c54e7fa 145
6e280648
EB
146 uint32_t check_align; /* If non-zero, check for aligned client requests */
147
ac132d05 148 NBDMode mode;
fd358d83 149 NBDMetaContexts contexts; /* Negotiated meta contexts */
9a304d29 150
0cfae925
VSO
151 uint32_t opt; /* Current option being negotiated */
152 uint32_t optlen; /* remaining length of data in ioc for the option being
153 negotiated now */
154};
7a5ca864 155
ff82911c 156static void nbd_client_receive_next_request(NBDClient *client);
958c717d 157
6b8c01e7 158/* Basic flow for negotiation
7a5ca864
FB
159
160 Server Client
7a5ca864 161 Negotiate
6b8c01e7
PB
162
163 or
164
165 Server Client
166 Negotiate #1
167 Option
168 Negotiate #2
169
170 ----
171
172 followed by
173
174 Server Client
7a5ca864
FB
175 Request
176 Response
177 Request
178 Response
179 ...
180 ...
181 Request (type == 2)
6b8c01e7 182
7a5ca864
FB
183*/
184
1d17922a
VSO
185static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
186 uint32_t type, uint32_t length)
187{
188 stq_be_p(&rep->magic, NBD_REP_MAGIC);
189 stl_be_p(&rep->option, option);
190 stl_be_p(&rep->type, type);
191 stl_be_p(&rep->length, length);
192}
193
526e5c65
EB
194/* Send a reply header, including length, but no payload.
195 * Return -errno on error, 0 on success. */
0cfae925
VSO
196static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
197 uint32_t len, Error **errp)
6b8c01e7 198{
1d17922a 199 NBDOptionReply rep;
6b8c01e7 200
1d17922a 201 trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
3736cc5b 202 type, nbd_rep_lookup(type), len);
f95910fe 203
f37708f6 204 assert(len < NBD_MAX_BUFFER_SIZE);
2fd2c840 205
1d17922a
VSO
206 set_be_option_rep(&rep, client->opt, type, len);
207 return nbd_write(client->ioc, &rep, sizeof(rep), errp);
f5076b5a 208}
6b8c01e7 209
526e5c65
EB
210/* Send a reply header with default 0 length.
211 * Return -errno on error, 0 on success. */
0cfae925 212static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
2fd2c840 213 Error **errp)
526e5c65 214{
0cfae925 215 return nbd_negotiate_send_rep_len(client, type, 0, errp);
526e5c65
EB
216}
217
36683283
EB
218/* Send an error reply.
219 * Return -errno on error, 0 on success. */
9edc6313 220static int G_GNUC_PRINTF(4, 0)
41f5dfaf
EB
221nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
222 Error **errp, const char *fmt, va_list va)
36683283 223{
795d946d 224 ERRP_GUARD();
df18c04e 225 g_autofree char *msg = NULL;
36683283
EB
226 int ret;
227 size_t len;
228
36683283 229 msg = g_strdup_vprintf(fmt, va);
36683283 230 len = strlen(msg);
5c4fe018 231 assert(len < NBD_MAX_STRING_SIZE);
9588463e 232 trace_nbd_negotiate_send_rep_err(msg);
0cfae925 233 ret = nbd_negotiate_send_rep_len(client, type, len, errp);
36683283 234 if (ret < 0) {
df18c04e 235 return ret;
36683283 236 }
0cfae925 237 if (nbd_write(client->ioc, msg, len, errp) < 0) {
2fd2c840 238 error_prepend(errp, "write failed (error message): ");
df18c04e 239 return -EIO;
36683283 240 }
2fd2c840 241
df18c04e 242 return 0;
36683283
EB
243}
244
5c4fe018
EB
/*
 * Return a newly allocated copy of @name suitable for use in an error
 * reply: names shorter than 80 bytes are copied verbatim, anything else
 * is cut to its first 80 bytes followed by "...".
 */
static char *
nbd_sanitize_name(const char *name)
{
    const bool is_short = strnlen(name, 80) < 80;

    /* XXX Should we also try to sanitize any control characters? */
    return is_short ? g_strdup(name) : g_strdup_printf("%.80s...", name);
}
257
41f5dfaf
EB
258/* Send an error reply.
259 * Return -errno on error, 0 on success. */
9edc6313 260static int G_GNUC_PRINTF(4, 5)
41f5dfaf
EB
261nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
262 Error **errp, const char *fmt, ...)
263{
264 va_list va;
265 int ret;
266
267 va_start(va, fmt);
268 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
269 va_end(va);
270 return ret;
271}
272
894e0280
EB
273/* Drop remainder of the current option, and send a reply with the
274 * given error type and message. Return -errno on read or write
275 * failure; or 0 if connection is still live. */
9edc6313 276static int G_GNUC_PRINTF(4, 0)
2e425fd5
VSO
277nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
278 const char *fmt, va_list va)
894e0280
EB
279{
280 int ret = nbd_drop(client->ioc, client->optlen, errp);
894e0280
EB
281
282 client->optlen = 0;
283 if (!ret) {
894e0280 284 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
894e0280
EB
285 }
286 return ret;
287}
288
9edc6313 289static int G_GNUC_PRINTF(4, 5)
2e425fd5
VSO
290nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
291 const char *fmt, ...)
292{
293 int ret;
294 va_list va;
295
296 va_start(va, fmt);
297 ret = nbd_opt_vdrop(client, type, errp, fmt, va);
298 va_end(va);
299
300 return ret;
301}
302
9edc6313 303static int G_GNUC_PRINTF(3, 4)
2e425fd5
VSO
304nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
305{
306 int ret;
307 va_list va;
308
309 va_start(va, fmt);
310 ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
311 va_end(va);
312
313 return ret;
314}
315
894e0280 316/* Read size bytes from the unparsed payload of the current option.
d1e2c3e7 317 * If @check_nul, require that no NUL bytes appear in buffer.
894e0280
EB
318 * Return -errno on I/O error, 0 if option was completely handled by
319 * sending a reply about inconsistent lengths, or 1 on success. */
320static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
d1e2c3e7 321 bool check_nul, Error **errp)
894e0280
EB
322{
323 if (size > client->optlen) {
2e425fd5
VSO
324 return nbd_opt_invalid(client, errp,
325 "Inconsistent lengths in option %s",
326 nbd_opt_lookup(client->opt));
894e0280
EB
327 }
328 client->optlen -= size;
d1e2c3e7
EB
329 if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
330 return -EIO;
331 }
332
333 if (check_nul && strnlen(buffer, size) != size) {
334 return nbd_opt_invalid(client, errp,
335 "Unexpected embedded NUL in option %s",
336 nbd_opt_lookup(client->opt));
337 }
338 return 1;
894e0280
EB
339}
340
e7b1948d
VSO
341/* Drop size bytes from the unparsed payload of the current option.
342 * Return -errno on I/O error, 0 if option was completely handled by
343 * sending a reply about inconsistent lengths, or 1 on success. */
344static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
345{
346 if (size > client->optlen) {
347 return nbd_opt_invalid(client, errp,
348 "Inconsistent lengths in option %s",
349 nbd_opt_lookup(client->opt));
350 }
351 client->optlen -= size;
352 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
353}
354
12296459
VSO
355/* nbd_opt_read_name
356 *
357 * Read a string with the format:
93676c88 358 * uint32_t len (<= NBD_MAX_STRING_SIZE)
12296459
VSO
359 * len bytes string (not 0-terminated)
360 *
9d7ab222 361 * On success, @name will be allocated.
12296459
VSO
362 * If @length is non-null, it will be set to the actual string length.
363 *
364 * Return -errno on I/O error, 0 if option was completely handled by
365 * sending a reply about inconsistent lengths, or 1 on success.
366 */
9d7ab222 367static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
12296459
VSO
368 Error **errp)
369{
370 int ret;
371 uint32_t len;
9d7ab222 372 g_autofree char *local_name = NULL;
12296459 373
9d7ab222 374 *name = NULL;
d1e2c3e7 375 ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
12296459
VSO
376 if (ret <= 0) {
377 return ret;
378 }
80c7c2b0 379 len = cpu_to_be32(len);
12296459 380
93676c88 381 if (len > NBD_MAX_STRING_SIZE) {
12296459
VSO
382 return nbd_opt_invalid(client, errp,
383 "Invalid name length: %" PRIu32, len);
384 }
385
9d7ab222 386 local_name = g_malloc(len + 1);
d1e2c3e7 387 ret = nbd_opt_read(client, local_name, len, true, errp);
12296459
VSO
388 if (ret <= 0) {
389 return ret;
390 }
9d7ab222 391 local_name[len] = '\0';
12296459
VSO
392
393 if (length) {
394 *length = len;
395 }
9d7ab222 396 *name = g_steal_pointer(&local_name);
12296459
VSO
397
398 return 1;
399}
400
526e5c65
EB
401/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
402 * Return -errno on error, 0 on success. */
0cfae925 403static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
2fd2c840 404 Error **errp)
32d7d2e0 405{
795d946d 406 ERRP_GUARD();
b1a75b33 407 size_t name_len, desc_len;
526e5c65 408 uint32_t len;
b1a75b33
EB
409 const char *name = exp->name ? exp->name : "";
410 const char *desc = exp->description ? exp->description : "";
0cfae925 411 QIOChannel *ioc = client->ioc;
2e5c9ad6 412 int ret;
32d7d2e0 413
9588463e 414 trace_nbd_negotiate_send_rep_list(name, desc);
b1a75b33
EB
415 name_len = strlen(name);
416 desc_len = strlen(desc);
93676c88 417 assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
526e5c65 418 len = name_len + desc_len + sizeof(len);
0cfae925 419 ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
2e5c9ad6
VSO
420 if (ret < 0) {
421 return ret;
32d7d2e0 422 }
526e5c65 423
32d7d2e0 424 len = cpu_to_be32(name_len);
2fd2c840
VSO
425 if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
426 error_prepend(errp, "write failed (name length): ");
b1a75b33
EB
427 return -EINVAL;
428 }
2fd2c840
VSO
429
430 if (nbd_write(ioc, name, name_len, errp) < 0) {
431 error_prepend(errp, "write failed (name buffer): ");
32d7d2e0
HB
432 return -EINVAL;
433 }
2fd2c840
VSO
434
435 if (nbd_write(ioc, desc, desc_len, errp) < 0) {
436 error_prepend(errp, "write failed (description buffer): ");
32d7d2e0
HB
437 return -EINVAL;
438 }
2fd2c840 439
32d7d2e0
HB
440 return 0;
441}
442
526e5c65
EB
443/* Process the NBD_OPT_LIST command, with a potential series of replies.
444 * Return -errno on error, 0 on success. */
e68c35cf 445static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
32d7d2e0 446{
32d7d2e0 447 NBDExport *exp;
0cfae925 448 assert(client->opt == NBD_OPT_LIST);
32d7d2e0 449
32d7d2e0
HB
450 /* For each export, send a NBD_REP_SERVER reply. */
451 QTAILQ_FOREACH(exp, &exports, next) {
0cfae925 452 if (nbd_negotiate_send_rep_list(client, exp, errp)) {
32d7d2e0
HB
453 return -EINVAL;
454 }
455 }
456 /* Finish with a NBD_REP_ACK. */
0cfae925 457 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
32d7d2e0
HB
458}
459
fd358d83 460static void nbd_check_meta_export(NBDClient *client, NBDExport *exp)
e7b1948d 461{
fd358d83
EB
462 if (exp != client->contexts.exp) {
463 client->contexts.count = 0;
47ec485e 464 }
e7b1948d
VSO
465}
466
f37708f6
EB
467/* Send a reply to NBD_OPT_EXPORT_NAME.
468 * Return -errno on error, 0 on success. */
dbb38caa 469static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
2fd2c840 470 Error **errp)
f5076b5a 471{
795d946d 472 ERRP_GUARD();
9d7ab222 473 g_autofree char *name = NULL;
5f66d060 474 char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
23e099c3
EB
475 size_t len;
476 int ret;
dbb38caa 477 uint16_t myflags;
6b8c01e7 478
f5076b5a
HB
479 /* Client sends:
480 [20 .. xx] export name (length bytes)
5f66d060
EB
481 Server replies:
482 [ 0 .. 7] size
483 [ 8 .. 9] export flags
484 [10 .. 133] reserved (0) [unless no_zeroes]
f5076b5a 485 */
9588463e 486 trace_nbd_negotiate_handle_export_name();
9c1d2614
EB
487 if (client->mode >= NBD_MODE_EXTENDED) {
488 error_setg(errp, "Extended headers already negotiated");
489 return -EINVAL;
490 }
93676c88 491 if (client->optlen > NBD_MAX_STRING_SIZE) {
2fd2c840 492 error_setg(errp, "Bad length received");
d9faeed8 493 return -EINVAL;
6b8c01e7 494 }
9d7ab222 495 name = g_malloc(client->optlen + 1);
e6798f06 496 if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
32f158a6 497 return -EIO;
6b8c01e7 498 }
0cfae925
VSO
499 name[client->optlen] = '\0';
500 client->optlen = 0;
6b8c01e7 501
9588463e 502 trace_nbd_negotiate_handle_export_name_request(name);
9344e5f5 503
6b8c01e7
PB
504 client->exp = nbd_export_find(name);
505 if (!client->exp) {
2fd2c840 506 error_setg(errp, "export not found");
d9faeed8 507 return -EINVAL;
6b8c01e7 508 }
fd358d83 509 nbd_check_meta_export(client, client->exp);
6b8c01e7 510
dbb38caa 511 myflags = client->exp->nbdflags;
ac132d05 512 if (client->mode >= NBD_MODE_STRUCTURED) {
dbb38caa
EB
513 myflags |= NBD_FLAG_SEND_DF;
514 }
2dcbb11b
EB
515 if (client->mode >= NBD_MODE_EXTENDED && client->contexts.count) {
516 myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
517 }
dbb38caa 518 trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
23e099c3 519 stq_be_p(buf, client->exp->size);
dbb38caa 520 stw_be_p(buf + 8, myflags);
23e099c3
EB
521 len = no_zeroes ? 10 : sizeof(buf);
522 ret = nbd_write(client->ioc, buf, len, errp);
523 if (ret < 0) {
524 error_prepend(errp, "write failed: ");
525 return ret;
526 }
527
6b8c01e7 528 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
c69de1be 529 blk_exp_ref(&client->exp->common);
d9faeed8
VSO
530
531 return 0;
6b8c01e7
PB
532}
533
f37708f6
EB
534/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
535 * The buffer does NOT include the info type prefix.
536 * Return -errno on error, 0 if ready to send more. */
0cfae925 537static int nbd_negotiate_send_info(NBDClient *client,
f37708f6
EB
538 uint16_t info, uint32_t length, void *buf,
539 Error **errp)
540{
541 int rc;
542
543 trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
0cfae925 544 rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
f37708f6
EB
545 sizeof(info) + length, errp);
546 if (rc < 0) {
547 return rc;
548 }
80c7c2b0 549 info = cpu_to_be16(info);
f37708f6
EB
550 if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
551 return -EIO;
552 }
553 if (nbd_write(client->ioc, buf, length, errp) < 0) {
554 return -EIO;
555 }
556 return 0;
557}
558
a16a7907
EB
559/* nbd_reject_length: Handle any unexpected payload.
560 * @fatal requests that we quit talking to the client, even if we are able
561 * to successfully send an error reply.
562 * Return:
563 * -errno transmission error occurred or @fatal was requested, errp is set
564 * 0 error message successfully sent to client, errp is not set
565 */
0cfae925 566static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
a16a7907
EB
567{
568 int ret;
569
0cfae925 570 assert(client->optlen);
2e425fd5
VSO
571 ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
572 nbd_opt_lookup(client->opt));
a16a7907 573 if (fatal && !ret) {
894e0280 574 error_setg(errp, "option '%s' has unexpected length",
0cfae925 575 nbd_opt_lookup(client->opt));
a16a7907
EB
576 return -EINVAL;
577 }
578 return ret;
579}
580
f37708f6
EB
581/* Handle NBD_OPT_INFO and NBD_OPT_GO.
582 * Return -errno on error, 0 if ready for next option, and 1 to move
583 * into transmission phase. */
dbb38caa 584static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
f37708f6
EB
585{
586 int rc;
9d7ab222 587 g_autofree char *name = NULL;
f37708f6
EB
588 NBDExport *exp;
589 uint16_t requests;
590 uint16_t request;
bbc35fc2 591 uint32_t namelen = 0;
f37708f6 592 bool sendname = false;
0c1d50bd
EB
593 bool blocksize = false;
594 uint32_t sizes[3];
f37708f6 595 char buf[sizeof(uint64_t) + sizeof(uint16_t)];
6e280648 596 uint32_t check_align = 0;
dbb38caa 597 uint16_t myflags;
f37708f6
EB
598
599 /* Client sends:
600 4 bytes: L, name length (can be 0)
601 L bytes: export name
602 2 bytes: N, number of requests (can be 0)
603 N * 2 bytes: N requests
604 */
9d7ab222 605 rc = nbd_opt_read_name(client, &name, &namelen, errp);
894e0280
EB
606 if (rc <= 0) {
607 return rc;
f37708f6 608 }
f37708f6
EB
609 trace_nbd_negotiate_handle_export_name_request(name);
610
d1e2c3e7 611 rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
894e0280
EB
612 if (rc <= 0) {
613 return rc;
f37708f6 614 }
80c7c2b0 615 requests = be16_to_cpu(requests);
f37708f6 616 trace_nbd_negotiate_handle_info_requests(requests);
f37708f6 617 while (requests--) {
d1e2c3e7 618 rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
894e0280
EB
619 if (rc <= 0) {
620 return rc;
f37708f6 621 }
80c7c2b0 622 request = be16_to_cpu(request);
f37708f6
EB
623 trace_nbd_negotiate_handle_info_request(request,
624 nbd_info_lookup(request));
0c1d50bd
EB
625 /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
626 * everything else is either a request we don't know or
627 * something we send regardless of request */
628 switch (request) {
629 case NBD_INFO_NAME:
f37708f6 630 sendname = true;
0c1d50bd
EB
631 break;
632 case NBD_INFO_BLOCK_SIZE:
633 blocksize = true;
634 break;
f37708f6
EB
635 }
636 }
894e0280
EB
637 if (client->optlen) {
638 return nbd_reject_length(client, false, errp);
639 }
f37708f6
EB
640
641 exp = nbd_export_find(name);
642 if (!exp) {
5c4fe018
EB
643 g_autofree char *sane_name = nbd_sanitize_name(name);
644
0cfae925
VSO
645 return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
646 errp, "export '%s' not present",
5c4fe018 647 sane_name);
f37708f6 648 }
fd358d83
EB
649 if (client->opt == NBD_OPT_GO) {
650 nbd_check_meta_export(client, exp);
651 }
f37708f6
EB
652
653 /* Don't bother sending NBD_INFO_NAME unless client requested it */
654 if (sendname) {
0cfae925 655 rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
f37708f6
EB
656 errp);
657 if (rc < 0) {
658 return rc;
659 }
660 }
661
662 /* Send NBD_INFO_DESCRIPTION only if available, regardless of
663 * client request */
664 if (exp->description) {
665 size_t len = strlen(exp->description);
666
93676c88 667 assert(len <= NBD_MAX_STRING_SIZE);
0cfae925 668 rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
f37708f6
EB
669 len, exp->description, errp);
670 if (rc < 0) {
671 return rc;
672 }
673 }
674
0c1d50bd
EB
675 /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
676 * according to whether the client requested it, and according to
677 * whether this is OPT_INFO or OPT_GO. */
b0245d64
EB
678 /* minimum - 1 for back-compat, or actual if client will obey it. */
679 if (client->opt == NBD_OPT_INFO || blocksize) {
37a4f70c 680 check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
b0245d64
EB
681 } else {
682 sizes[0] = 1;
683 }
684 assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
0c1d50bd
EB
685 /* preferred - Hard-code to 4096 for now.
686 * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
b0245d64 687 sizes[1] = MAX(4096, sizes[0]);
0c1d50bd 688 /* maximum - At most 32M, but smaller as appropriate. */
37a4f70c 689 sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
0c1d50bd 690 trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
80c7c2b0
PM
691 sizes[0] = cpu_to_be32(sizes[0]);
692 sizes[1] = cpu_to_be32(sizes[1]);
693 sizes[2] = cpu_to_be32(sizes[2]);
0cfae925 694 rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
0c1d50bd
EB
695 sizeof(sizes), sizes, errp);
696 if (rc < 0) {
697 return rc;
698 }
699
f37708f6 700 /* Send NBD_INFO_EXPORT always */
dbb38caa 701 myflags = exp->nbdflags;
ac132d05 702 if (client->mode >= NBD_MODE_STRUCTURED) {
dbb38caa
EB
703 myflags |= NBD_FLAG_SEND_DF;
704 }
2dcbb11b
EB
705 if (client->mode >= NBD_MODE_EXTENDED &&
706 (client->contexts.count || client->opt == NBD_OPT_INFO)) {
707 myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
708 }
dbb38caa 709 trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
f37708f6 710 stq_be_p(buf, exp->size);
dbb38caa 711 stw_be_p(buf + 8, myflags);
0cfae925 712 rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
f37708f6
EB
713 sizeof(buf), buf, errp);
714 if (rc < 0) {
715 return rc;
716 }
717
099fbcd6
EB
718 /*
719 * If the client is just asking for NBD_OPT_INFO, but forgot to
720 * request block sizes in a situation that would impact
721 * performance, then return an error. But for NBD_OPT_GO, we
722 * tolerate all clients, regardless of alignments.
723 */
724 if (client->opt == NBD_OPT_INFO && !blocksize &&
37a4f70c 725 blk_get_request_alignment(exp->common.blk) > 1) {
0cfae925
VSO
726 return nbd_negotiate_send_rep_err(client,
727 NBD_REP_ERR_BLOCK_SIZE_REQD,
0c1d50bd
EB
728 errp,
729 "request NBD_INFO_BLOCK_SIZE to "
730 "use this export");
731 }
732
f37708f6 733 /* Final reply */
0cfae925 734 rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
f37708f6
EB
735 if (rc < 0) {
736 return rc;
737 }
738
0cfae925 739 if (client->opt == NBD_OPT_GO) {
f37708f6 740 client->exp = exp;
6e280648 741 client->check_align = check_align;
f37708f6 742 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
c69de1be 743 blk_exp_ref(&client->exp->common);
f37708f6
EB
744 rc = 1;
745 }
746 return rc;
f37708f6
EB
747}
748
749
36683283
EB
750/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
751 * new channel for all further (now-encrypted) communication. */
f95910fe 752static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
2fd2c840 753 Error **errp)
f95910fe
DB
754{
755 QIOChannel *ioc;
756 QIOChannelTLS *tioc;
757 struct NBDTLSHandshakeData data = { 0 };
758
0cfae925
VSO
759 assert(client->opt == NBD_OPT_STARTTLS);
760
9588463e 761 trace_nbd_negotiate_handle_starttls();
f95910fe 762 ioc = client->ioc;
f95910fe 763
0cfae925 764 if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
63d5ef86
EB
765 return NULL;
766 }
f95910fe
DB
767
768 tioc = qio_channel_tls_new_server(ioc,
769 client->tlscreds,
b25e12da 770 client->tlsauthz,
2fd2c840 771 errp);
f95910fe
DB
772 if (!tioc) {
773 return NULL;
774 }
775
0d73f725 776 qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
9588463e 777 trace_nbd_negotiate_handle_starttls_handshake();
f95910fe
DB
778 data.loop = g_main_loop_new(g_main_context_default(), FALSE);
779 qio_channel_tls_handshake(tioc,
780 nbd_tls_handshake,
781 &data,
1939ccda 782 NULL,
f95910fe
DB
783 NULL);
784
785 if (!data.complete) {
786 g_main_loop_run(data.loop);
787 }
788 g_main_loop_unref(data.loop);
789 if (data.error) {
790 object_unref(OBJECT(tioc));
2fd2c840 791 error_propagate(errp, data.error);
f95910fe
DB
792 return NULL;
793 }
794
795 return QIO_CHANNEL(tioc);
796}
797
e7b1948d
VSO
798/* nbd_negotiate_send_meta_context
799 *
800 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
801 *
802 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
803 */
804static int nbd_negotiate_send_meta_context(NBDClient *client,
805 const char *context,
806 uint32_t context_id,
807 Error **errp)
808{
809 NBDOptionReplyMetaContext opt;
810 struct iovec iov[] = {
811 {.iov_base = &opt, .iov_len = sizeof(opt)},
812 {.iov_base = (void *)context, .iov_len = strlen(context)}
813 };
814
93676c88 815 assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
e7b1948d
VSO
816 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
817 context_id = 0;
818 }
819
2b53af25 820 trace_nbd_negotiate_meta_query_reply(context, context_id);
e7b1948d
VSO
821 set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
822 sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
823 stl_be_p(&opt.context_id, context_id);
824
825 return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
826}
827
ebd57062
EB
828/*
829 * Return true if @query matches @pattern, or if @query is empty when
830 * the @client is performing _LIST_.
dbb8b396 831 */
ebd57062
EB
832static bool nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
833 const char *query)
e7b1948d 834{
ebd57062
EB
835 if (!*query) {
836 trace_nbd_negotiate_meta_query_parse("empty");
837 return client->opt == NBD_OPT_LIST_META_CONTEXT;
e7b1948d 838 }
ebd57062 839 if (strcmp(query, pattern) == 0) {
b0769d8f 840 trace_nbd_negotiate_meta_query_parse(pattern);
ebd57062 841 return true;
e7b1948d 842 }
ebd57062
EB
843 trace_nbd_negotiate_meta_query_skip("pattern not matched");
844 return false;
e7b1948d
VSO
845}
846
/*
 * If *@str begins with @prefix, advance *@str past the prefix and return
 * true.  Otherwise leave *@str untouched and return false.
 */
static bool nbd_strshift(const char **str, const char *prefix)
{
    const size_t prefix_len = strlen(prefix);

    if (strncmp(*str, prefix, prefix_len) != 0) {
        return false;
    }

    *str += prefix_len;
    return true;
}
860
861/* nbd_meta_base_query
862 *
863 * Handle queries to 'base' namespace. For now, only the base:allocation
ebd57062 864 * context is available. Return true if @query has been handled.
b0769d8f 865 */
fd358d83 866static bool nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
ebd57062 867 const char *query)
b0769d8f 868{
ebd57062
EB
869 if (!nbd_strshift(&query, "base:")) {
870 return false;
871 }
872 trace_nbd_negotiate_meta_query_parse("base:");
873
874 if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
875 meta->base_allocation = true;
876 }
877 return true;
b0769d8f
VSO
878}
879
ebd57062 880/* nbd_meta_qemu_query
3d068aff 881 *
ebd57062 882 * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
71719cd5
EB
883 * and qemu:allocation-depth contexts are available. Return true if @query
884 * has been handled.
ebd57062 885 */
fd358d83 886static bool nbd_meta_qemu_query(NBDClient *client, NBDMetaContexts *meta,
ebd57062 887 const char *query)
3d068aff 888{
3b1f244c
EB
889 size_t i;
890
ebd57062
EB
891 if (!nbd_strshift(&query, "qemu:")) {
892 return false;
3d068aff 893 }
ebd57062 894 trace_nbd_negotiate_meta_query_parse("qemu:");
3d068aff 895
ebd57062 896 if (!*query) {
3d068aff 897 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
71719cd5 898 meta->allocation_depth = meta->exp->allocation_depth;
76df2b8d
EB
899 if (meta->exp->nr_export_bitmaps) {
900 memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
901 }
3d068aff
VSO
902 }
903 trace_nbd_negotiate_meta_query_parse("empty");
ebd57062 904 return true;
3d068aff
VSO
905 }
906
71719cd5
EB
907 if (strcmp(query, "allocation-depth") == 0) {
908 trace_nbd_negotiate_meta_query_parse("allocation-depth");
909 meta->allocation_depth = meta->exp->allocation_depth;
910 return true;
911 }
912
ebd57062
EB
913 if (nbd_strshift(&query, "dirty-bitmap:")) {
914 trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
3b1f244c 915 if (!*query) {
76df2b8d
EB
916 if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
917 meta->exp->nr_export_bitmaps) {
3b1f244c
EB
918 memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
919 }
920 trace_nbd_negotiate_meta_query_parse("empty");
ebd57062
EB
921 return true;
922 }
3b1f244c
EB
923
924 for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
925 const char *bm_name;
926
927 bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
928 if (strcmp(bm_name, query) == 0) {
929 meta->bitmaps[i] = true;
930 trace_nbd_negotiate_meta_query_parse(query);
931 return true;
932 }
ebd57062 933 }
3b1f244c 934 trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
ebd57062 935 return true;
3d068aff
VSO
936 }
937
71719cd5 938 trace_nbd_negotiate_meta_query_skip("unknown qemu context");
ebd57062 939 return true;
3d068aff
VSO
940}
941
e7b1948d
VSO
942/* nbd_negotiate_meta_query
943 *
944 * Parse namespace name and call corresponding function to parse body of the
945 * query.
946 *
93676c88 947 * The only supported namespaces are 'base' and 'qemu'.
e7b1948d 948 *
e7b1948d
VSO
949 * Return -errno on I/O error, 0 if option was completely handled by
950 * sending a reply about inconsistent lengths, or 1 on success. */
951static int nbd_negotiate_meta_query(NBDClient *client,
fd358d83 952 NBDMetaContexts *meta, Error **errp)
e7b1948d
VSO
953{
954 int ret;
ebd57062 955 g_autofree char *query = NULL;
e7b1948d
VSO
956 uint32_t len;
957
d1e2c3e7 958 ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
e7b1948d
VSO
959 if (ret <= 0) {
960 return ret;
961 }
80c7c2b0 962 len = cpu_to_be32(len);
e7b1948d 963
93676c88
EB
964 if (len > NBD_MAX_STRING_SIZE) {
965 trace_nbd_negotiate_meta_query_skip("length too long");
966 return nbd_opt_skip(client, len, errp);
967 }
e7b1948d 968
ebd57062
EB
969 query = g_malloc(len + 1);
970 ret = nbd_opt_read(client, query, len, true, errp);
e7b1948d
VSO
971 if (ret <= 0) {
972 return ret;
973 }
ebd57062 974 query[len] = '\0';
3d068aff 975
ebd57062
EB
976 if (nbd_meta_base_query(client, meta, query)) {
977 return 1;
978 }
979 if (nbd_meta_qemu_query(client, meta, query)) {
980 return 1;
e7b1948d
VSO
981 }
982
3d068aff 983 trace_nbd_negotiate_meta_query_skip("unknown namespace");
ebd57062 984 return 1;
e7b1948d
VSO
985}
986
987/* nbd_negotiate_meta_queries
988 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
989 *
990 * Return -errno on I/O error, or 0 if option was completely handled. */
fd358d83 991static int nbd_negotiate_meta_queries(NBDClient *client, Error **errp)
e7b1948d
VSO
992{
993 int ret;
9d7ab222 994 g_autofree char *export_name = NULL;
cd1675f8
RH
995 /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
996 g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
fd358d83
EB
997 NBDMetaContexts local_meta = {0};
998 NBDMetaContexts *meta;
e7b1948d 999 uint32_t nb_queries;
3b1f244c 1000 size_t i;
47ec485e 1001 size_t count = 0;
e7b1948d 1002
ac132d05
EB
1003 if (client->opt == NBD_OPT_SET_META_CONTEXT &&
1004 client->mode < NBD_MODE_STRUCTURED) {
e7b1948d
VSO
1005 return nbd_opt_invalid(client, errp,
1006 "request option '%s' when structured reply "
1007 "is not negotiated",
1008 nbd_opt_lookup(client->opt));
1009 }
1010
1011 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
1012 /* Only change the caller's meta on SET. */
1013 meta = &local_meta;
fd358d83
EB
1014 } else {
1015 meta = &client->contexts;
e7b1948d
VSO
1016 }
1017
3b1f244c 1018 g_free(meta->bitmaps);
e7b1948d
VSO
1019 memset(meta, 0, sizeof(*meta));
1020
9d7ab222 1021 ret = nbd_opt_read_name(client, &export_name, NULL, errp);
e7b1948d
VSO
1022 if (ret <= 0) {
1023 return ret;
1024 }
1025
af736e54
VSO
1026 meta->exp = nbd_export_find(export_name);
1027 if (meta->exp == NULL) {
5c4fe018
EB
1028 g_autofree char *sane_name = nbd_sanitize_name(export_name);
1029
e7b1948d 1030 return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
5c4fe018 1031 "export '%s' not present", sane_name);
e7b1948d 1032 }
3b1f244c
EB
1033 meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
1034 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
1035 bitmaps = meta->bitmaps;
1036 }
e7b1948d 1037
d1e2c3e7 1038 ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
e7b1948d
VSO
1039 if (ret <= 0) {
1040 return ret;
1041 }
80c7c2b0 1042 nb_queries = cpu_to_be32(nb_queries);
2b53af25 1043 trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
af736e54 1044 export_name, nb_queries);
e7b1948d
VSO
1045
1046 if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
1047 /* enable all known contexts */
1048 meta->base_allocation = true;
71719cd5 1049 meta->allocation_depth = meta->exp->allocation_depth;
76df2b8d
EB
1050 if (meta->exp->nr_export_bitmaps) {
1051 memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
1052 }
e7b1948d
VSO
1053 } else {
1054 for (i = 0; i < nb_queries; ++i) {
1055 ret = nbd_negotiate_meta_query(client, meta, errp);
1056 if (ret <= 0) {
1057 return ret;
1058 }
1059 }
1060 }
1061
1062 if (meta->base_allocation) {
1063 ret = nbd_negotiate_send_meta_context(client, "base:allocation",
1064 NBD_META_ID_BASE_ALLOCATION,
1065 errp);
1066 if (ret < 0) {
1067 return ret;
1068 }
47ec485e 1069 count++;
e7b1948d
VSO
1070 }
1071
71719cd5
EB
1072 if (meta->allocation_depth) {
1073 ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
1074 NBD_META_ID_ALLOCATION_DEPTH,
1075 errp);
1076 if (ret < 0) {
1077 return ret;
1078 }
1079 count++;
1080 }
1081
3b1f244c
EB
1082 for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
1083 const char *bm_name;
1084 g_autofree char *context = NULL;
1085
1086 if (!meta->bitmaps[i]) {
1087 continue;
1088 }
1089
1090 bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
1091 context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);
02e87e3b
EB
1092
1093 ret = nbd_negotiate_send_meta_context(client, context,
3b1f244c 1094 NBD_META_ID_DIRTY_BITMAP + i,
3d068aff
VSO
1095 errp);
1096 if (ret < 0) {
1097 return ret;
1098 }
47ec485e 1099 count++;
3d068aff
VSO
1100 }
1101
e7b1948d
VSO
1102 ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1103 if (ret == 0) {
47ec485e 1104 meta->count = count;
e7b1948d
VSO
1105 }
1106
1107 return ret;
1108}
1109
1e120ffe 1110/* nbd_negotiate_options
f37708f6
EB
1111 * Process all NBD_OPT_* client option commands, during fixed newstyle
1112 * negotiation.
1e120ffe 1113 * Return:
2fd2c840
VSO
1114 * -errno on error, errp is set
1115 * 0 on successful negotiation, errp is not set
1116 * 1 if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1117 * errp is not set
1e120ffe 1118 */
dbb38caa 1119static int nbd_negotiate_options(NBDClient *client, Error **errp)
f5076b5a 1120{
9c122ada 1121 uint32_t flags;
26afa868 1122 bool fixedNewstyle = false;
23e099c3 1123 bool no_zeroes = false;
9c122ada
HR
1124
1125 /* Client sends:
1126 [ 0 .. 3] client flags
1127
f37708f6 1128 Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
9c122ada
HR
1129 [ 0 .. 7] NBD_OPTS_MAGIC
1130 [ 8 .. 11] NBD option
1131 [12 .. 15] Data length
1132 ... Rest of request
1133
1134 [ 0 .. 7] NBD_OPTS_MAGIC
1135 [ 8 .. 11] Second NBD option
1136 [12 .. 15] Data length
1137 ... Rest of request
1138 */
1139
e6798f06 1140 if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
9c122ada
HR
1141 return -EIO;
1142 }
ac132d05 1143 client->mode = NBD_MODE_EXPORT_NAME;
621c4f4e 1144 trace_nbd_negotiate_options_flags(flags);
26afa868 1145 if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
26afa868
DB
1146 fixedNewstyle = true;
1147 flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
ac132d05 1148 client->mode = NBD_MODE_SIMPLE;
26afa868 1149 }
c203c59a 1150 if (flags & NBD_FLAG_C_NO_ZEROES) {
23e099c3 1151 no_zeroes = true;
c203c59a
EB
1152 flags &= ~NBD_FLAG_C_NO_ZEROES;
1153 }
26afa868 1154 if (flags != 0) {
2fd2c840 1155 error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
621c4f4e 1156 return -EINVAL;
9c122ada
HR
1157 }
1158
f5076b5a 1159 while (1) {
9c122ada 1160 int ret;
7f9039cd 1161 uint32_t option, length;
f5076b5a
HB
1162 uint64_t magic;
1163
e6798f06 1164 if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
f5076b5a
HB
1165 return -EINVAL;
1166 }
9588463e
VSO
1167 trace_nbd_negotiate_options_check_magic(magic);
1168 if (magic != NBD_OPTS_MAGIC) {
2fd2c840 1169 error_setg(errp, "Bad magic received");
f5076b5a
HB
1170 return -EINVAL;
1171 }
1172
e6798f06 1173 if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
f5076b5a
HB
1174 return -EINVAL;
1175 }
0cfae925 1176 client->opt = option;
f5076b5a 1177
e6798f06 1178 if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
f5076b5a
HB
1179 return -EINVAL;
1180 }
894e0280 1181 assert(!client->optlen);
0cfae925 1182 client->optlen = length;
f5076b5a 1183
fdad35ef 1184 if (length > NBD_MAX_BUFFER_SIZE) {
b2578459 1185 error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
fdad35ef
EB
1186 length, NBD_MAX_BUFFER_SIZE);
1187 return -EINVAL;
1188 }
1189
3736cc5b
EB
1190 trace_nbd_negotiate_options_check_option(option,
1191 nbd_opt_lookup(option));
f95910fe
DB
1192 if (client->tlscreds &&
1193 client->ioc == (QIOChannel *)client->sioc) {
1194 QIOChannel *tioc;
1195 if (!fixedNewstyle) {
7f9039cd 1196 error_setg(errp, "Unsupported option 0x%" PRIx32, option);
f95910fe
DB
1197 return -EINVAL;
1198 }
7f9039cd 1199 switch (option) {
f95910fe 1200 case NBD_OPT_STARTTLS:
e68c35cf
EB
1201 if (length) {
1202 /* Unconditionally drop the connection if the client
1203 * can't start a TLS negotiation correctly */
0cfae925 1204 return nbd_reject_length(client, true, errp);
e68c35cf
EB
1205 }
1206 tioc = nbd_negotiate_handle_starttls(client, errp);
f95910fe
DB
1207 if (!tioc) {
1208 return -EIO;
1209 }
8cbee49e 1210 ret = 0;
f95910fe 1211 object_unref(OBJECT(client->ioc));
7d5b0d68 1212 client->ioc = tioc;
f95910fe
DB
1213 break;
1214
d1129a8a
EB
1215 case NBD_OPT_EXPORT_NAME:
1216 /* No way to return an error to client, so drop connection */
2fd2c840 1217 error_setg(errp, "Option 0x%x not permitted before TLS",
7f9039cd 1218 option);
d1129a8a
EB
1219 return -EINVAL;
1220
f95910fe 1221 default:
3e99ebb9
EB
1222 /* Let the client keep trying, unless they asked to
1223 * quit. Always try to give an error back to the
1224 * client; but when replying to OPT_ABORT, be aware
1225 * that the client may hang up before receiving the
1226 * error, in which case we are fine ignoring the
1227 * resulting EPIPE. */
1228 ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
1229 option == NBD_OPT_ABORT ? NULL : errp,
894e0280 1230 "Option 0x%" PRIx32
0b0bb124 1231 " not permitted before TLS", option);
7f9039cd 1232 if (option == NBD_OPT_ABORT) {
1e120ffe 1233 return 1;
b6f5d3b5 1234 }
d1129a8a 1235 break;
f95910fe
DB
1236 }
1237 } else if (fixedNewstyle) {
7f9039cd 1238 switch (option) {
26afa868 1239 case NBD_OPT_LIST:
e68c35cf 1240 if (length) {
0cfae925 1241 ret = nbd_reject_length(client, false, errp);
e68c35cf
EB
1242 } else {
1243 ret = nbd_negotiate_handle_list(client, errp);
1244 }
26afa868
DB
1245 break;
1246
1247 case NBD_OPT_ABORT:
b6f5d3b5
EB
1248 /* NBD spec says we must try to reply before
1249 * disconnecting, but that we must also tolerate
1250 * guests that don't wait for our reply. */
0cfae925 1251 nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
1e120ffe 1252 return 1;
26afa868
DB
1253
1254 case NBD_OPT_EXPORT_NAME:
dbb38caa 1255 return nbd_negotiate_handle_export_name(client, no_zeroes,
23e099c3 1256 errp);
26afa868 1257
f37708f6
EB
1258 case NBD_OPT_INFO:
1259 case NBD_OPT_GO:
dbb38caa 1260 ret = nbd_negotiate_handle_info(client, errp);
f37708f6
EB
1261 if (ret == 1) {
1262 assert(option == NBD_OPT_GO);
1263 return 0;
1264 }
f37708f6
EB
1265 break;
1266
f95910fe 1267 case NBD_OPT_STARTTLS:
e68c35cf 1268 if (length) {
0cfae925 1269 ret = nbd_reject_length(client, false, errp);
e68c35cf 1270 } else if (client->tlscreds) {
0cfae925
VSO
1271 ret = nbd_negotiate_send_rep_err(client,
1272 NBD_REP_ERR_INVALID, errp,
36683283 1273 "TLS already enabled");
f95910fe 1274 } else {
0cfae925
VSO
1275 ret = nbd_negotiate_send_rep_err(client,
1276 NBD_REP_ERR_POLICY, errp,
36683283 1277 "TLS not configured");
63d5ef86 1278 }
d1129a8a 1279 break;
5c54e7fa
VSO
1280
1281 case NBD_OPT_STRUCTURED_REPLY:
1282 if (length) {
0cfae925 1283 ret = nbd_reject_length(client, false, errp);
9c1d2614
EB
1284 } else if (client->mode >= NBD_MODE_EXTENDED) {
1285 ret = nbd_negotiate_send_rep_err(
1286 client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
1287 "extended headers already negotiated");
ac132d05 1288 } else if (client->mode >= NBD_MODE_STRUCTURED) {
5c54e7fa 1289 ret = nbd_negotiate_send_rep_err(
0cfae925 1290 client, NBD_REP_ERR_INVALID, errp,
5c54e7fa
VSO
1291 "structured reply already negotiated");
1292 } else {
0cfae925 1293 ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
ac132d05 1294 client->mode = NBD_MODE_STRUCTURED;
5c54e7fa
VSO
1295 }
1296 break;
1297
e7b1948d
VSO
1298 case NBD_OPT_LIST_META_CONTEXT:
1299 case NBD_OPT_SET_META_CONTEXT:
fd358d83 1300 ret = nbd_negotiate_meta_queries(client, errp);
e7b1948d
VSO
1301 break;
1302
9c1d2614
EB
1303 case NBD_OPT_EXTENDED_HEADERS:
1304 if (length) {
1305 ret = nbd_reject_length(client, false, errp);
1306 } else if (client->mode >= NBD_MODE_EXTENDED) {
1307 ret = nbd_negotiate_send_rep_err(
1308 client, NBD_REP_ERR_INVALID, errp,
1309 "extended headers already negotiated");
1310 } else {
1311 ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1312 client->mode = NBD_MODE_EXTENDED;
1313 }
1314 break;
1315
26afa868 1316 default:
894e0280 1317 ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
28fb494f 1318 "Unsupported option %" PRIu32 " (%s)",
894e0280 1319 option, nbd_opt_lookup(option));
156f6a10 1320 break;
26afa868
DB
1321 }
1322 } else {
1323 /*
1324 * If broken new-style we should drop the connection
1325 * for anything except NBD_OPT_EXPORT_NAME
1326 */
7f9039cd 1327 switch (option) {
26afa868 1328 case NBD_OPT_EXPORT_NAME:
dbb38caa 1329 return nbd_negotiate_handle_export_name(client, no_zeroes,
23e099c3 1330 errp);
26afa868
DB
1331
1332 default:
28fb494f 1333 error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
3736cc5b 1334 option, nbd_opt_lookup(option));
26afa868 1335 return -EINVAL;
32d7d2e0 1336 }
f5076b5a 1337 }
8cbee49e
EB
1338 if (ret < 0) {
1339 return ret;
1340 }
f5076b5a
HB
1341 }
1342}
1343
1e120ffe
VSO
1344/* nbd_negotiate
1345 * Return:
2fd2c840
VSO
1346 * -errno on error, errp is set
1347 * 0 on successful negotiation, errp is not set
1348 * 1 if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1349 * errp is not set
1e120ffe 1350 */
2fd2c840 1351static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
7a5ca864 1352{
795d946d 1353 ERRP_GUARD();
5f66d060 1354 char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
2e5c9ad6 1355 int ret;
b2e3d87f 1356
5f66d060 1357 /* Old style negotiation header, no room for options
6b8c01e7
PB
1358 [ 0 .. 7] passwd ("NBDMAGIC")
1359 [ 8 .. 15] magic (NBD_CLIENT_MAGIC)
b2e3d87f 1360 [16 .. 23] size
5f66d060 1361 [24 .. 27] export flags (zero-extended)
6b8c01e7
PB
1362 [28 .. 151] reserved (0)
1363
5f66d060 1364 New style negotiation header, client can send options
6b8c01e7
PB
1365 [ 0 .. 7] passwd ("NBDMAGIC")
1366 [ 8 .. 15] magic (NBD_OPTS_MAGIC)
1367 [16 .. 17] server flags (0)
f37708f6 1368 ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
b2e3d87f
NT
1369 */
1370
1c778ef7 1371 qio_channel_set_blocking(client->ioc, false, NULL);
06e0f098 1372 qio_channel_set_follow_coroutine_ctx(client->ioc, true);
185b4338 1373
9588463e 1374 trace_nbd_negotiate_begin();
b2e3d87f 1375 memcpy(buf, "NBDMAGIC", 8);
f95910fe 1376
7f7dfe2a
VSO
1377 stq_be_p(buf + 8, NBD_OPTS_MAGIC);
1378 stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);
b2e3d87f 1379
7f7dfe2a
VSO
1380 if (nbd_write(client->ioc, buf, 18, errp) < 0) {
1381 error_prepend(errp, "write failed: ");
1382 return -EINVAL;
1383 }
dbb38caa 1384 ret = nbd_negotiate_options(client, errp);
7f7dfe2a
VSO
1385 if (ret != 0) {
1386 if (ret < 0) {
1387 error_prepend(errp, "option negotiation failed: ");
6b8c01e7 1388 }
7f7dfe2a 1389 return ret;
b2e3d87f
NT
1390 }
1391
0cfae925 1392 assert(!client->optlen);
9588463e 1393 trace_nbd_negotiate_success();
d9faeed8
VSO
1394
1395 return 0;
7a5ca864
FB
1396}
1397
f148ae7d
SL
1398/* nbd_read_eof
1399 * Tries to read @size bytes from @ioc. This is a local implementation of
1400 * qio_channel_readv_all_eof. We have it here because we need it to be
1401 * interruptible and to know when the coroutine is yielding.
1402 * Returns 1 on success
1403 * 0 on eof, when no data was read (errp is not set)
1404 * negative errno on failure (errp is set)
1405 */
1406static inline int coroutine_fn
1407nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
1408{
1409 bool partial = false;
1410
1411 assert(size);
1412 while (size > 0) {
1413 struct iovec iov = { .iov_base = buffer, .iov_len = size };
1414 ssize_t len;
1415
1416 len = qio_channel_readv(client->ioc, &iov, 1, errp);
1417 if (len == QIO_CHANNEL_ERR_BLOCK) {
1418 client->read_yielding = true;
1419 qio_channel_yield(client->ioc, G_IO_IN);
1420 client->read_yielding = false;
1421 if (client->quiescing) {
1422 return -EAGAIN;
1423 }
1424 continue;
1425 } else if (len < 0) {
1426 return -EIO;
1427 } else if (len == 0) {
1428 if (partial) {
1429 error_setg(errp,
1430 "Unexpected end-of-file before all bytes were read");
1431 return -EIO;
1432 } else {
1433 return 0;
1434 }
1435 }
1436
1437 partial = true;
1438 size -= len;
1439 buffer = (uint8_t *) buffer + len;
1440 }
1441 return 1;
1442}
1443
d2223cdd
PB
1444static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request,
1445 Error **errp)
75818250 1446{
c8720ca0
EB
1447 uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
1448 uint32_t magic, expect;
a0dc63a6 1449 int ret;
c8720ca0
EB
1450 size_t size = client->mode >= NBD_MODE_EXTENDED ?
1451 NBD_EXTENDED_REQUEST_SIZE : NBD_REQUEST_SIZE;
b2e3d87f 1452
c8720ca0 1453 ret = nbd_read_eof(client, buf, size, errp);
185b4338
PB
1454 if (ret < 0) {
1455 return ret;
1456 }
1644ccce
EB
1457 if (ret == 0) {
1458 return -EIO;
1459 }
185b4338 1460
c8720ca0
EB
1461 /*
1462 * Compact request
1463 * [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
1464 * [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...)
1465 * [ 6 .. 7] type (NBD_CMD_READ, ...)
1466 * [ 8 .. 15] cookie
1467 * [16 .. 23] from
1468 * [24 .. 27] len
1469 * Extended request
1470 * [ 0 .. 3] magic (NBD_EXTENDED_REQUEST_MAGIC)
1471 * [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, NBD_CMD_FLAG_PAYLOAD_LEN, ...)
1472 * [ 6 .. 7] type (NBD_CMD_READ, ...)
1473 * [ 8 .. 15] cookie
1474 * [16 .. 23] from
1475 * [24 .. 31] len
b2e3d87f
NT
1476 */
1477
773dce3c 1478 magic = ldl_be_p(buf);
b626b51a
EB
1479 request->flags = lduw_be_p(buf + 4);
1480 request->type = lduw_be_p(buf + 6);
22efd811 1481 request->cookie = ldq_be_p(buf + 8);
773dce3c 1482 request->from = ldq_be_p(buf + 16);
c8720ca0
EB
1483 if (client->mode >= NBD_MODE_EXTENDED) {
1484 request->len = ldq_be_p(buf + 24);
1485 expect = NBD_EXTENDED_REQUEST_MAGIC;
1486 } else {
1487 request->len = (uint32_t)ldl_be_p(buf + 24); /* widen 32 to 64 bits */
1488 expect = NBD_REQUEST_MAGIC;
1489 }
b2e3d87f 1490
9588463e
VSO
1491 trace_nbd_receive_request(magic, request->flags, request->type,
1492 request->from, request->len);
b2e3d87f 1493
c8720ca0
EB
1494 if (magic != expect) {
1495 error_setg(errp, "invalid magic (got 0x%" PRIx32 ", expected 0x%"
1496 PRIx32 ")", magic, expect);
185b4338 1497 return -EINVAL;
b2e3d87f
NT
1498 }
1499 return 0;
75818250
TS
1500}
1501
41996e38
PB
/*
 * Upper bound on a client's nb_requests counter: nbd_request_get()
 * asserts that the counter is below this value before incrementing it.
 */
1502#define MAX_NBD_REQUESTS 16
1503
ce33967a 1504void nbd_client_get(NBDClient *client)
1743b515
PB
1505{
1506 client->refcount++;
1507}
1508
ce33967a 1509void nbd_client_put(NBDClient *client)
1743b515
PB
1510{
1511 if (--client->refcount == 0) {
ff2b68aa 1512 /* The last reference should be dropped by client->close,
f53a829b 1513 * which is called by client_close.
ff2b68aa
PB
1514 */
1515 assert(client->closing);
1516
1c778ef7
DB
1517 object_unref(OBJECT(client->sioc));
1518 object_unref(OBJECT(client->ioc));
f95910fe
DB
1519 if (client->tlscreds) {
1520 object_unref(OBJECT(client->tlscreds));
1521 }
b25e12da 1522 g_free(client->tlsauthz);
6b8c01e7
PB
1523 if (client->exp) {
1524 QTAILQ_REMOVE(&client->exp->clients, client, next);
c69de1be 1525 blk_exp_unref(&client->exp->common);
6b8c01e7 1526 }
fd358d83 1527 g_free(client->contexts.bitmaps);
1743b515
PB
1528 g_free(client);
1529 }
1530}
1531
0c9390d9 1532static void client_close(NBDClient *client, bool negotiated)
1743b515 1533{
ff2b68aa
PB
1534 if (client->closing) {
1535 return;
1536 }
1537
1538 client->closing = true;
1539
1540 /* Force requests to finish. They will drop their own references,
1541 * then we'll close the socket and free the NBDClient.
1542 */
1c778ef7
DB
1543 qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
1544 NULL);
ff2b68aa
PB
1545
1546 /* Also tell the client, so that they release their reference. */
0c9390d9
EB
1547 if (client->close_fn) {
1548 client->close_fn(client, negotiated);
1743b515 1549 }
1743b515
PB
1550}
1551
315f78ab 1552static NBDRequestData *nbd_request_get(NBDClient *client)
d9a73806 1553{
315f78ab 1554 NBDRequestData *req;
72deddc5 1555
41996e38
PB
1556 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1557 client->nb_requests++;
1558
315f78ab 1559 req = g_new0(NBDRequestData, 1);
72deddc5
PB
1560 nbd_client_get(client);
1561 req->client = client;
d9a73806
PB
1562 return req;
1563}
1564
315f78ab 1565static void nbd_request_put(NBDRequestData *req)
d9a73806 1566{
72deddc5 1567 NBDClient *client = req->client;
e1adb27a 1568
2d821488
SH
1569 if (req->data) {
1570 qemu_vfree(req->data);
1571 }
1729404c 1572 g_free(req);
e1adb27a 1573
958c717d 1574 client->nb_requests--;
fd6afc50
SL
1575
1576 if (client->quiescing && client->nb_requests == 0) {
1577 aio_wait_kick();
1578 }
1579
ff82911c
PB
1580 nbd_client_receive_next_request(client);
1581
72deddc5 1582 nbd_client_put(client);
d9a73806
PB
1583}
1584
aadf99a7 1585static void blk_aio_attached(AioContext *ctx, void *opaque)
f2149281
HR
1586{
1587 NBDExport *exp = opaque;
1588 NBDClient *client;
1589
9588463e 1590 trace_nbd_blk_aio_attached(exp->name, ctx);
f2149281 1591
8612c686 1592 exp->common.ctx = ctx;
f2149281
HR
1593
1594 QTAILQ_FOREACH(client, &exp->clients, next) {
fd6afc50 1595 assert(client->nb_requests == 0);
f148ae7d
SL
1596 assert(client->recv_coroutine == NULL);
1597 assert(client->send_coroutine == NULL);
f148ae7d
SL
1598 }
1599}
1600
fd6afc50 1601static void blk_aio_detach(void *opaque)
f148ae7d
SL
1602{
1603 NBDExport *exp = opaque;
f148ae7d 1604
fd6afc50
SL
1605 trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);
1606
fd6afc50
SL
1607 exp->common.ctx = NULL;
1608}
1609
1610static void nbd_drained_begin(void *opaque)
1611{
1612 NBDExport *exp = opaque;
1613 NBDClient *client;
1614
1615 QTAILQ_FOREACH(client, &exp->clients, next) {
f148ae7d 1616 client->quiescing = true;
fd6afc50
SL
1617 }
1618}
f148ae7d 1619
fd6afc50
SL
1620static void nbd_drained_end(void *opaque)
1621{
1622 NBDExport *exp = opaque;
1623 NBDClient *client;
f148ae7d 1624
fd6afc50
SL
1625 QTAILQ_FOREACH(client, &exp->clients, next) {
1626 client->quiescing = false;
1627 nbd_client_receive_next_request(client);
f2149281
HR
1628 }
1629}
1630
fd6afc50 1631static bool nbd_drained_poll(void *opaque)
f2149281
HR
1632{
1633 NBDExport *exp = opaque;
fd6afc50 1634 NBDClient *client;
f2149281 1635
fd6afc50
SL
1636 QTAILQ_FOREACH(client, &exp->clients, next) {
1637 if (client->nb_requests != 0) {
1638 /*
1639 * If there's a coroutine waiting for a request on nbd_read_eof()
1640 * enter it here so we don't depend on the client to wake it up.
1641 */
1642 if (client->recv_coroutine != NULL && client->read_yielding) {
7c1f51bf 1643 qio_channel_wake_read(client->ioc);
fd6afc50 1644 }
f2149281 1645
fd6afc50
SL
1646 return true;
1647 }
1648 }
f2149281 1649
fd6afc50 1650 return false;
f2149281
HR
1651}
1652
741cc431
HR
1653static void nbd_eject_notifier(Notifier *n, void *data)
1654{
1655 NBDExport *exp = container_of(n, NBDExport, eject_notifier);
61bc846d 1656
bc4ee65b 1657 blk_exp_request_shutdown(&exp->common);
741cc431
HR
1658}
1659
9b562c64
KW
1660void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
1661{
1662 NBDExport *nbd_exp = container_of(exp, NBDExport, common);
1663 assert(exp->drv == &blk_exp_nbd);
1664 assert(nbd_exp->eject_notifier_blk == NULL);
1665
1666 blk_ref(blk);
1667 nbd_exp->eject_notifier_blk = blk;
1668 nbd_exp->eject_notifier.notify = nbd_eject_notifier;
1669 blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
1670}
1671
fd6afc50
SL
/*
 * Device callbacks installed on the export's BlockBackend by
 * nbd_export_create() via blk_set_dev_ops(); they hook the three
 * drained-section callbacks defined in this file.
 */
1672static const BlockDevOps nbd_block_ops = {
1673 .drained_begin = nbd_drained_begin,
1674 .drained_end = nbd_drained_end,
1675 .drained_poll = nbd_drained_poll,
1676};
1677
5b1cb497
KW
1678static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
1679 Error **errp)
af49bbbe 1680{
a6ff7989 1681 NBDExport *exp = container_of(blk_exp, NBDExport, common);
5b1cb497 1682 BlockExportOptionsNbd *arg = &exp_args->u.nbd;
8461b4d6 1683 const char *name = arg->name ?: exp_args->node_name;
331170e0 1684 BlockBackend *blk = blk_exp->blk;
b57e4de0 1685 int64_t size;
331170e0 1686 uint64_t perm, shared_perm;
5b1cb497 1687 bool readonly = !exp_args->writable;
e5fb29d5 1688 BlockDirtyBitmapOrStrList *bitmaps;
3b1f244c 1689 size_t i;
d7086422 1690 int ret;
cd7fca95 1691
372b69f5 1692 GLOBAL_STATE_CODE();
5b1cb497
KW
1693 assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);
1694
1695 if (!nbd_server_is_running()) {
1696 error_setg(errp, "NBD server not running");
1697 return -EINVAL;
1698 }
1699
8461b4d6
MA
1700 if (strlen(name) > NBD_MAX_STRING_SIZE) {
1701 error_setg(errp, "export name '%s' too long", name);
5b1cb497
KW
1702 return -EINVAL;
1703 }
1704
1705 if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
1706 error_setg(errp, "description '%s' too long", arg->description);
1707 return -EINVAL;
1708 }
1709
8461b4d6
MA
1710 if (nbd_export_find(name)) {
1711 error_setg(errp, "NBD server already has export named '%s'", name);
5b1cb497
KW
1712 return -EEXIST;
1713 }
1714
331170e0 1715 size = blk_getlength(blk);
b57e4de0
KW
1716 if (size < 0) {
1717 error_setg_errno(errp, -size,
1718 "Failed to determine the NBD export's length");
a6ff7989 1719 return size;
b57e4de0
KW
1720 }
1721
8a7ce4f9
KW
1722 /* Don't allow resize while the NBD server is running, otherwise we don't
1723 * care what happens with the node. */
331170e0 1724 blk_get_perm(blk, &perm, &shared_perm);
331170e0 1725 ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
d7086422 1726 if (ret < 0) {
331170e0 1727 return ret;
d7086422 1728 }
331170e0 1729
4b9441f6 1730 QTAILQ_INIT(&exp->clients);
8461b4d6 1731 exp->name = g_strdup(name);
5b1cb497 1732 exp->description = g_strdup(arg->description);
dbb38caa
EB
1733 exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
1734 NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
58a6fdcc
EB
1735
1736 if (nbd_server_max_connections() != 1) {
1737 exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
1738 }
dbb38caa
EB
1739 if (readonly) {
1740 exp->nbdflags |= NBD_FLAG_READ_ONLY;
dbb38caa 1741 } else {
b491dbb7
EB
1742 exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
1743 NBD_FLAG_SEND_FAST_ZERO);
dbb38caa 1744 }
7596bbb3 1745 exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);
98f44bbe 1746
372b69f5
KW
1747 bdrv_graph_rdlock_main_loop();
1748
cbad81ce 1749 for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
3b1f244c
EB
1750 exp->nr_export_bitmaps++;
1751 }
1752 exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
1753 for (i = 0, bitmaps = arg->bitmaps; bitmaps;
e5fb29d5
VSO
1754 i++, bitmaps = bitmaps->next)
1755 {
1756 const char *bitmap;
331170e0 1757 BlockDriverState *bs = blk_bs(blk);
678ba275 1758 BdrvDirtyBitmap *bm = NULL;
678ba275 1759
e5fb29d5
VSO
1760 switch (bitmaps->value->type) {
1761 case QTYPE_QSTRING:
1762 bitmap = bitmaps->value->u.local;
1763 while (bs) {
1764 bm = bdrv_find_dirty_bitmap(bs, bitmap);
1765 if (bm != NULL) {
1766 break;
1767 }
1768
1769 bs = bdrv_filter_or_cow_bs(bs);
678ba275
EB
1770 }
1771
e5fb29d5
VSO
1772 if (bm == NULL) {
1773 ret = -ENOENT;
1774 error_setg(errp, "Bitmap '%s' is not found",
1775 bitmaps->value->u.local);
1776 goto fail;
1777 }
678ba275 1778
e5fb29d5
VSO
1779 if (readonly && bdrv_is_writable(bs) &&
1780 bdrv_dirty_bitmap_enabled(bm)) {
1781 ret = -EINVAL;
1782 error_setg(errp, "Enabled bitmap '%s' incompatible with "
1783 "readonly export", bitmap);
1784 goto fail;
1785 }
1786 break;
1787 case QTYPE_QDICT:
1788 bitmap = bitmaps->value->u.external.name;
1789 bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
1790 bitmap, NULL, errp);
1791 if (!bm) {
1792 ret = -ENOENT;
1793 goto fail;
1794 }
1795 break;
1796 default:
1797 abort();
678ba275
EB
1798 }
1799
e5fb29d5 1800 assert(bm);
3b78a927 1801
e5fb29d5 1802 if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
a6ff7989 1803 ret = -EINVAL;
678ba275
EB
1804 goto fail;
1805 }
1806
3b1f244c 1807 exp->export_bitmaps[i] = bm;
cbad81ce 1808 assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
678ba275
EB
1809 }
1810
3b1f244c
EB
1811 /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
1812 for (i = 0; i < exp->nr_export_bitmaps; i++) {
1813 bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
1814 }
1815
dbc7b014
EB
1816 exp->allocation_depth = arg->allocation_depth;
1817
fd6afc50
SL
1818 /*
1819 * We need to inhibit request queuing in the block layer to ensure we can
1820 * be properly quiesced when entering a drained section, as our coroutines
1821 * servicing pending requests might enter blk_pread().
1822 */
1823 blk_set_disable_request_queuing(blk, true);
1824
aadf99a7 1825 blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
741cc431 1826
fd6afc50
SL
1827 blk_set_dev_ops(blk, &nbd_block_ops, exp);
1828
3fa4c765 1829 QTAILQ_INSERT_TAIL(&exports, exp, next);
c69de1be 1830
372b69f5
KW
1831 bdrv_graph_rdunlock_main_loop();
1832
a6ff7989 1833 return 0;
98f44bbe
HR
1834
1835fail:
372b69f5 1836 bdrv_graph_rdunlock_main_loop();
3b1f244c 1837 g_free(exp->export_bitmaps);
3fa4c765
EB
1838 g_free(exp->name);
1839 g_free(exp->description);
a6ff7989 1840 return ret;
af49bbbe
PB
1841}
1842
ee0a19ec
PB
1843NBDExport *nbd_export_find(const char *name)
1844{
1845 NBDExport *exp;
1846 QTAILQ_FOREACH(exp, &exports, next) {
1847 if (strcmp(name, exp->name) == 0) {
1848 return exp;
1849 }
1850 }
1851
1852 return NULL;
1853}
1854
61bc846d
EB
1855AioContext *
1856nbd_export_aio_context(NBDExport *exp)
1857{
8612c686 1858 return exp->common.ctx;
61bc846d
EB
1859}
1860
bc4ee65b 1861static void nbd_export_request_shutdown(BlockExport *blk_exp)
af49bbbe 1862{
bc4ee65b 1863 NBDExport *exp = container_of(blk_exp, NBDExport, common);
4b9441f6 1864 NBDClient *client, *next;
2c8d9f06 1865
c69de1be 1866 blk_exp_ref(&exp->common);
3fa4c765
EB
1867 /*
1868 * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
1869 * close mode that stops advertising the export to new clients but
1870 * still permits existing clients to run to completion? Because of
1871 * that possibility, nbd_export_close() can be called more than
1872 * once on an export.
1873 */
4b9441f6 1874 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
0c9390d9 1875 client_close(client, true);
4b9441f6 1876 }
3fa4c765 1877 if (exp->name) {
3fa4c765
EB
1878 g_free(exp->name);
1879 exp->name = NULL;
1880 QTAILQ_REMOVE(&exports, exp, next);
1881 }
c69de1be 1882 blk_exp_unref(&exp->common);
2c8d9f06
PB
1883}
1884
c69de1be 1885static void nbd_export_delete(BlockExport *blk_exp)
2c8d9f06 1886{
3b1f244c 1887 size_t i;
c69de1be 1888 NBDExport *exp = container_of(blk_exp, NBDExport, common);
2c8d9f06 1889
c69de1be
KW
1890 assert(exp->name == NULL);
1891 assert(QTAILQ_EMPTY(&exp->clients));
d6268348 1892
c69de1be
KW
1893 g_free(exp->description);
1894 exp->description = NULL;
1895
dd5b6780
PB
1896 if (exp->eject_notifier_blk) {
1897 notifier_remove(&exp->eject_notifier);
1898 blk_unref(exp->eject_notifier_blk);
c69de1be 1899 }
dd5b6780
PB
1900 blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
1901 blk_aio_detach, exp);
1902 blk_set_disable_request_queuing(exp->common.blk, false);
3d068aff 1903
3b1f244c
EB
1904 for (i = 0; i < exp->nr_export_bitmaps; i++) {
1905 bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
2c8d9f06 1906 }
af49bbbe
PB
1907}
1908
56ee8626
KW
/*
 * Block export driver description for NBD exports: ties the NBD export
 * type to the create, delete and shutdown callbacks defined in this
 * file, with NBDExport as the per-instance structure.
 */
1909const BlockExportDriver blk_exp_nbd = {
1910 .type = BLOCK_EXPORT_TYPE_NBD,
a6ff7989 1911 .instance_size = sizeof(NBDExport),
56ee8626 1912 .create = nbd_export_create,
c69de1be 1913 .delete = nbd_export_delete,
bc4ee65b 1914 .request_shutdown = nbd_export_request_shutdown,
56ee8626
KW
1915};
1916
de79bfc3
VSO
1917static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
1918 unsigned niov, Error **errp)
1919{
1920 int ret;
1921
1922 g_assert(qemu_in_coroutine());
1923 qemu_co_mutex_lock(&client->send_lock);
1924 client->send_coroutine = qemu_coroutine_self();
1925
1926 ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
1927
1928 client->send_coroutine = NULL;
1929 qemu_co_mutex_unlock(&client->send_lock);
1930
1931 return ret;
1932}
1933
caad5384 1934static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
22efd811 1935 uint64_t cookie)
caad5384
VSO
1936{
1937 stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
1938 stl_be_p(&reply->error, error);
22efd811 1939 stq_be_p(&reply->cookie, cookie);
caad5384
VSO
1940}
1941
d2223cdd 1942static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
66d4f4fe 1943 NBDRequest *request,
d2223cdd
PB
1944 uint32_t error,
1945 void *data,
b2578459 1946 uint64_t len,
d2223cdd 1947 Error **errp)
22045592 1948{
de79bfc3 1949 NBDSimpleReply reply;
14cea41d 1950 int nbd_err = system_errno_to_nbd_errno(error);
de79bfc3
VSO
1951 struct iovec iov[] = {
1952 {.iov_base = &reply, .iov_len = sizeof(reply)},
1953 {.iov_base = data, .iov_len = len}
1954 };
6fb2b972 1955
a7c8ed36 1956 assert(!len || !nbd_err);
b2578459 1957 assert(len <= NBD_MAX_BUFFER_SIZE);
ac132d05
EB
1958 assert(client->mode < NBD_MODE_STRUCTURED ||
1959 (client->mode == NBD_MODE_STRUCTURED &&
1960 request->type != NBD_CMD_READ));
22efd811 1961 trace_nbd_co_send_simple_reply(request->cookie, nbd_err,
66d4f4fe 1962 nbd_err_lookup(nbd_err), len);
22efd811 1963 set_be_simple_reply(&reply, nbd_err, request->cookie);
262db388 1964
a7c8ed36 1965 return nbd_co_send_iov(client, iov, 2, errp);
22045592
PB
1966}
1967
a7c8ed36
EB
1968/*
1969 * Prepare the header of a reply chunk for network transmission.
1970 *
1971 * On input, @iov is partially initialized: iov[0].iov_base must point
1972 * to an uninitialized NBDReply, while the remaining @niov elements
1973 * (if any) must be ready for transmission. This function then
1974 * populates iov[0] for transmission.
1975 */
1976static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
1977 size_t niov, uint16_t flags, uint16_t type,
66d4f4fe 1978 NBDRequest *request)
5c54e7fa 1979{
a7c8ed36
EB
1980 size_t i, length = 0;
1981
1982 for (i = 1; i < niov; i++) {
1983 length += iov[i].iov_len;
1984 }
1985 assert(length <= NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData));
1986
11d3355f
EB
1987 if (client->mode >= NBD_MODE_EXTENDED) {
1988 NBDExtendedReplyChunk *chunk = iov->iov_base;
1989
1990 iov[0].iov_len = sizeof(*chunk);
1991 stl_be_p(&chunk->magic, NBD_EXTENDED_REPLY_MAGIC);
1992 stw_be_p(&chunk->flags, flags);
1993 stw_be_p(&chunk->type, type);
1994 stq_be_p(&chunk->cookie, request->cookie);
1995 stq_be_p(&chunk->offset, request->from);
1996 stq_be_p(&chunk->length, length);
1997 } else {
1998 NBDStructuredReplyChunk *chunk = iov->iov_base;
1999
2000 iov[0].iov_len = sizeof(*chunk);
2001 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
2002 stw_be_p(&chunk->flags, flags);
2003 stw_be_p(&chunk->type, type);
2004 stq_be_p(&chunk->cookie, request->cookie);
2005 stl_be_p(&chunk->length, length);
2006 }
5c54e7fa
VSO
2007}
2008
a7c8ed36 2009static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
66d4f4fe 2010 NBDRequest *request,
a7c8ed36 2011 Error **errp)
ef8c887e 2012{
a7c8ed36 2013 NBDReply hdr;
ef8c887e 2014 struct iovec iov[] = {
a7c8ed36 2015 {.iov_base = &hdr},
ef8c887e
EB
2016 };
2017
22efd811 2018 trace_nbd_co_send_chunk_done(request->cookie);
a7c8ed36 2019 set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
66d4f4fe 2020 NBD_REPLY_TYPE_NONE, request);
ef8c887e
EB
2021 return nbd_co_send_iov(client, iov, 1, errp);
2022}
2023
a7c8ed36 2024static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
66d4f4fe 2025 NBDRequest *request,
a7c8ed36
EB
2026 uint64_t offset,
2027 void *data,
b2578459 2028 uint64_t size,
a7c8ed36
EB
2029 bool final,
2030 Error **errp)
5c54e7fa 2031{
a7c8ed36 2032 NBDReply hdr;
efdc0c10 2033 NBDStructuredReadData chunk;
5c54e7fa 2034 struct iovec iov[] = {
a7c8ed36 2035 {.iov_base = &hdr},
5c54e7fa
VSO
2036 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2037 {.iov_base = data, .iov_len = size}
2038 };
2039
b2578459 2040 assert(size && size <= NBD_MAX_BUFFER_SIZE);
22efd811 2041 trace_nbd_co_send_chunk_read(request->cookie, offset, data, size);
a7c8ed36 2042 set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
66d4f4fe 2043 NBD_REPLY_TYPE_OFFSET_DATA, request);
5c54e7fa
VSO
2044 stq_be_p(&chunk.offset, offset);
2045
a7c8ed36 2046 return nbd_co_send_iov(client, iov, 3, errp);
5c54e7fa 2047}
ac132d05 2048
a7c8ed36 2049static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
66d4f4fe 2050 NBDRequest *request,
a7c8ed36
EB
2051 uint32_t error,
2052 const char *msg,
2053 Error **errp)
60ace2ba 2054{
a7c8ed36 2055 NBDReply hdr;
60ace2ba
VSO
2056 NBDStructuredError chunk;
2057 int nbd_err = system_errno_to_nbd_errno(error);
2058 struct iovec iov[] = {
a7c8ed36 2059 {.iov_base = &hdr},
60ace2ba
VSO
2060 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2061 {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
2062 };
2063
2064 assert(nbd_err);
22efd811 2065 trace_nbd_co_send_chunk_error(request->cookie, nbd_err,
a7c8ed36
EB
2066 nbd_err_lookup(nbd_err), msg ? msg : "");
2067 set_be_chunk(client, iov, 3, NBD_REPLY_FLAG_DONE,
66d4f4fe 2068 NBD_REPLY_TYPE_ERROR, request);
60ace2ba 2069 stl_be_p(&chunk.error, nbd_err);
a7c8ed36 2070 stw_be_p(&chunk.message_length, iov[2].iov_len);
60ace2ba 2071
a7c8ed36 2072 return nbd_co_send_iov(client, iov, 3, errp);
60ace2ba
VSO
2073}
2074
37e02aeb 2075/* Do a sparse read and send the structured reply to the client.
ff7e261b 2076 * Returns -errno if sending fails. blk_co_block_status_above() failure is
37e02aeb
VSO
2077 * reported to the client, at which point this function succeeds.
2078 */
418638d3 2079static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
66d4f4fe 2080 NBDRequest *request,
418638d3
EB
2081 uint64_t offset,
2082 uint8_t *data,
b2578459 2083 uint64_t size,
418638d3
EB
2084 Error **errp)
2085{
2086 int ret = 0;
2087 NBDExport *exp = client->exp;
2088 size_t progress = 0;
2089
b2578459 2090 assert(size <= NBD_MAX_BUFFER_SIZE);
418638d3
EB
2091 while (progress < size) {
2092 int64_t pnum;
ff7e261b
EGE
2093 int status = blk_co_block_status_above(exp->common.blk, NULL,
2094 offset + progress,
2095 size - progress, &pnum, NULL,
2096 NULL);
e2de3256 2097 bool final;
418638d3
EB
2098
2099 if (status < 0) {
37e02aeb
VSO
2100 char *msg = g_strdup_printf("unable to check for holes: %s",
2101 strerror(-status));
2102
66d4f4fe 2103 ret = nbd_co_send_chunk_error(client, request, -status, msg, errp);
37e02aeb
VSO
2104 g_free(msg);
2105 return ret;
418638d3
EB
2106 }
2107 assert(pnum && pnum <= size - progress);
e2de3256 2108 final = progress + pnum == size;
418638d3 2109 if (status & BDRV_BLOCK_ZERO) {
a7c8ed36 2110 NBDReply hdr;
418638d3
EB
2111 NBDStructuredReadHole chunk;
2112 struct iovec iov[] = {
a7c8ed36 2113 {.iov_base = &hdr},
418638d3
EB
2114 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2115 };
2116
22efd811 2117 trace_nbd_co_send_chunk_read_hole(request->cookie,
66d4f4fe 2118 offset + progress, pnum);
a7c8ed36
EB
2119 set_be_chunk(client, iov, 2,
2120 final ? NBD_REPLY_FLAG_DONE : 0,
66d4f4fe 2121 NBD_REPLY_TYPE_OFFSET_HOLE, request);
418638d3
EB
2122 stq_be_p(&chunk.offset, offset + progress);
2123 stl_be_p(&chunk.length, pnum);
a7c8ed36 2124 ret = nbd_co_send_iov(client, iov, 2, errp);
418638d3 2125 } else {
d2223cdd
PB
2126 ret = blk_co_pread(exp->common.blk, offset + progress, pnum,
2127 data + progress, 0);
418638d3
EB
2128 if (ret < 0) {
2129 error_setg_errno(errp, -ret, "reading from file failed");
2130 break;
2131 }
66d4f4fe 2132 ret = nbd_co_send_chunk_read(client, request, offset + progress,
a7c8ed36 2133 data + progress, pnum, final, errp);
418638d3
EB
2134 }
2135
2136 if (ret < 0) {
2137 break;
2138 }
2139 progress += pnum;
2140 }
418638d3
EB
2141 return ret;
2142}
2143
89cbc7e3 2144typedef struct NBDExtentArray {
bcc16cc1 2145 NBDExtent64 *extents;
89cbc7e3
VSO
2146 unsigned int nb_alloc;
2147 unsigned int count;
2148 uint64_t total_length;
bcc16cc1 2149 bool extended;
89cbc7e3
VSO
2150 bool can_add;
2151 bool converted_to_be;
2152} NBDExtentArray;
2153
bcc16cc1
EB
2154static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
2155 NBDMode mode)
89cbc7e3
VSO
2156{
2157 NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
2158
bcc16cc1 2159 assert(mode >= NBD_MODE_STRUCTURED);
89cbc7e3 2160 ea->nb_alloc = nb_alloc;
bcc16cc1
EB
2161 ea->extents = g_new(NBDExtent64, nb_alloc);
2162 ea->extended = mode >= NBD_MODE_EXTENDED;
89cbc7e3
VSO
2163 ea->can_add = true;
2164
2165 return ea;
2166}
2167
2168static void nbd_extent_array_free(NBDExtentArray *ea)
2169{
2170 g_free(ea->extents);
2171 g_free(ea);
2172}
e0e7fe07 2173G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)
89cbc7e3
VSO
2174
2175/* Further modifications of the array after conversion are abandoned */
2176static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
2177{
2178 int i;
2179
2180 assert(!ea->converted_to_be);
bcc16cc1 2181 assert(ea->extended);
89cbc7e3
VSO
2182 ea->can_add = false;
2183 ea->converted_to_be = true;
2184
2185 for (i = 0; i < ea->count; i++) {
bcc16cc1
EB
2186 ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
2187 ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
89cbc7e3
VSO
2188 }
2189}
2190
bcc16cc1
EB
2191/* Further modifications of the array after conversion are abandoned */
2192static NBDExtent32 *nbd_extent_array_convert_to_narrow(NBDExtentArray *ea)
2193{
2194 int i;
2195 NBDExtent32 *extents = g_new(NBDExtent32, ea->count);
2196
2197 assert(!ea->converted_to_be);
2198 assert(!ea->extended);
2199 ea->can_add = false;
2200 ea->converted_to_be = true;
2201
2202 for (i = 0; i < ea->count; i++) {
2203 assert((ea->extents[i].length | ea->extents[i].flags) <= UINT32_MAX);
2204 extents[i].length = cpu_to_be32(ea->extents[i].length);
2205 extents[i].flags = cpu_to_be32(ea->extents[i].flags);
2206 }
2207
2208 return extents;
2209}
2210
fb7afc79 2211/*
89cbc7e3
VSO
2212 * Add extent to NBDExtentArray. If extent can't be added (no available space),
2213 * return -1.
2214 * For safety, when returning -1 for the first time, .can_add is set to false,
314b9026
EB
2215 * and further calls to nbd_extent_array_add() will crash.
2216 * (this avoids the situation where a caller ignores failure to add one extent,
2217 * where adding another extent that would squash into the last array entry
2218 * would result in an incorrect range reported to the client)
fb7afc79 2219 */
89cbc7e3 2220static int nbd_extent_array_add(NBDExtentArray *ea,
bcc16cc1 2221 uint64_t length, uint32_t flags)
e7b1948d 2222{
89cbc7e3
VSO
2223 assert(ea->can_add);
2224
2225 if (!length) {
2226 return 0;
2227 }
bcc16cc1
EB
2228 if (!ea->extended) {
2229 assert(length <= UINT32_MAX);
2230 }
89cbc7e3
VSO
2231
2232 /* Extend previous extent if flags are the same */
2233 if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
bcc16cc1 2234 uint64_t sum = length + ea->extents[ea->count - 1].length;
89cbc7e3 2235
bcc16cc1
EB
2236 /*
2237 * sum cannot overflow: the block layer bounds image size at
2238 * 2^63, and ea->extents[].length comes from the block layer.
2239 */
2240 assert(sum >= length);
2241 if (sum <= UINT32_MAX || ea->extended) {
89cbc7e3
VSO
2242 ea->extents[ea->count - 1].length = sum;
2243 ea->total_length += length;
2244 return 0;
2245 }
2246 }
2247
2248 if (ea->count >= ea->nb_alloc) {
2249 ea->can_add = false;
2250 return -1;
2251 }
2252
2253 ea->total_length += length;
bcc16cc1 2254 ea->extents[ea->count] = (NBDExtent64) {.length = length, .flags = flags};
89cbc7e3 2255 ea->count++;
e7b1948d 2256
89cbc7e3
VSO
2257 return 0;
2258}
2259
ff7e261b 2260static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
6f58ac55
EGE
2261 uint64_t offset, uint64_t bytes,
2262 NBDExtentArray *ea)
89cbc7e3
VSO
2263{
2264 while (bytes) {
e7b1948d
VSO
2265 uint32_t flags;
2266 int64_t num;
ff7e261b
EGE
2267 int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
2268 NULL, NULL);
fb7afc79 2269
e7b1948d
VSO
2270 if (ret < 0) {
2271 return ret;
2272 }
2273
0da98568
NS
2274 flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
2275 (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
e7b1948d 2276
89cbc7e3
VSO
2277 if (nbd_extent_array_add(ea, num, flags) < 0) {
2278 return 0;
e7b1948d 2279 }
fb7afc79 2280
89cbc7e3
VSO
2281 offset += num;
2282 bytes -= num;
e7b1948d
VSO
2283 }
2284
e7b1948d
VSO
2285 return 0;
2286}
2287
ff7e261b 2288static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
6f58ac55
EGE
2289 uint64_t offset, uint64_t bytes,
2290 NBDExtentArray *ea)
71719cd5
EB
2291{
2292 while (bytes) {
2293 int64_t num;
ff7e261b
EGE
2294 int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
2295 &num);
71719cd5
EB
2296
2297 if (ret < 0) {
2298 return ret;
2299 }
2300
2301 if (nbd_extent_array_add(ea, num, ret) < 0) {
2302 return 0;
2303 }
2304
2305 offset += num;
2306 bytes -= num;
2307 }
2308
2309 return 0;
2310}
2311
89cbc7e3
VSO
2312/*
2313 * nbd_co_send_extents
3d068aff 2314 *
89cbc7e3
VSO
2315 * @ea is converted to BE by the function
2316 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
3d068aff 2317 */
d2223cdd 2318static int coroutine_fn
66d4f4fe 2319nbd_co_send_extents(NBDClient *client, NBDRequest *request, NBDExtentArray *ea,
d2223cdd 2320 bool last, uint32_t context_id, Error **errp)
e7b1948d 2321{
a7c8ed36 2322 NBDReply hdr;
bcc16cc1
EB
2323 NBDStructuredMeta meta;
2324 NBDExtendedMeta meta_ext;
2325 g_autofree NBDExtent32 *extents = NULL;
2326 uint16_t type;
2327 struct iovec iov[] = { {.iov_base = &hdr}, {0}, {0} };
e7b1948d 2328
bcc16cc1
EB
2329 if (client->mode >= NBD_MODE_EXTENDED) {
2330 type = NBD_REPLY_TYPE_BLOCK_STATUS_EXT;
2331
2332 iov[1].iov_base = &meta_ext;
2333 iov[1].iov_len = sizeof(meta_ext);
2334 stl_be_p(&meta_ext.context_id, context_id);
2335 stl_be_p(&meta_ext.count, ea->count);
2336
2337 nbd_extent_array_convert_to_be(ea);
2338 iov[2].iov_base = ea->extents;
2339 iov[2].iov_len = ea->count * sizeof(ea->extents[0]);
2340 } else {
2341 type = NBD_REPLY_TYPE_BLOCK_STATUS;
2342
2343 iov[1].iov_base = &meta;
2344 iov[1].iov_len = sizeof(meta);
2345 stl_be_p(&meta.context_id, context_id);
2346
2347 extents = nbd_extent_array_convert_to_narrow(ea);
2348 iov[2].iov_base = extents;
2349 iov[2].iov_len = ea->count * sizeof(extents[0]);
2350 }
89cbc7e3 2351
22efd811 2352 trace_nbd_co_send_extents(request->cookie, ea->count, context_id,
66d4f4fe 2353 ea->total_length, last);
bcc16cc1
EB
2354 set_be_chunk(client, iov, 3, last ? NBD_REPLY_FLAG_DONE : 0, type,
2355 request);
e7b1948d 2356
a7c8ed36 2357 return nbd_co_send_iov(client, iov, 3, errp);
e7b1948d
VSO
2358}
2359
2360/* Get block status from the exported device and send it to the client */
6f58ac55 2361static int
66d4f4fe 2362coroutine_fn nbd_co_send_block_status(NBDClient *client, NBDRequest *request,
ff7e261b 2363 BlockBackend *blk, uint64_t offset,
bcc16cc1 2364 uint64_t length, bool dont_fragment,
6f58ac55
EGE
2365 bool last, uint32_t context_id,
2366 Error **errp)
e7b1948d
VSO
2367{
2368 int ret;
416e34bd 2369 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
bcc16cc1
EB
2370 g_autoptr(NBDExtentArray) ea =
2371 nbd_extent_array_new(nb_extents, client->mode);
e7b1948d 2372
71719cd5 2373 if (context_id == NBD_META_ID_BASE_ALLOCATION) {
ff7e261b 2374 ret = blockstatus_to_extents(blk, offset, length, ea);
71719cd5 2375 } else {
ff7e261b 2376 ret = blockalloc_to_extents(blk, offset, length, ea);
71719cd5 2377 }
e7b1948d 2378 if (ret < 0) {
66d4f4fe 2379 return nbd_co_send_chunk_error(client, request, -ret,
a7c8ed36 2380 "can't get block status", errp);
e7b1948d
VSO
2381 }
2382
66d4f4fe 2383 return nbd_co_send_extents(client, request, ea, last, context_id, errp);
3d068aff
VSO
2384}
2385
dacbb6eb 2386/* Populate @ea from a dirty bitmap. */
89cbc7e3
VSO
2387static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
2388 uint64_t offset, uint64_t length,
dacbb6eb 2389 NBDExtentArray *es)
3d068aff 2390{
dacbb6eb
VSO
2391 int64_t start, dirty_start, dirty_count;
2392 int64_t end = offset + length;
2393 bool full = false;
bcc16cc1 2394 int64_t bound = es->extended ? INT64_MAX : INT32_MAX;
3d068aff
VSO
2395
2396 bdrv_dirty_bitmap_lock(bitmap);
2397
dacbb6eb 2398 for (start = offset;
bcc16cc1 2399 bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, bound,
dacbb6eb
VSO
2400 &dirty_start, &dirty_count);
2401 start = dirty_start + dirty_count)
2402 {
2403 if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
2404 (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
2405 {
2406 full = true;
89cbc7e3
VSO
2407 break;
2408 }
3d068aff
VSO
2409 }
2410
dacbb6eb 2411 if (!full) {
c0b21f2e
EB
2412 /* last non dirty extent, nothing to do if array is now full */
2413 (void) nbd_extent_array_add(es, end - start, 0);
dacbb6eb 2414 }
3d068aff
VSO
2415
2416 bdrv_dirty_bitmap_unlock(bitmap);
3d068aff
VSO
2417}
2418
66d4f4fe
EB
2419static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
2420 NBDRequest *request,
2421 BdrvDirtyBitmap *bitmap,
2422 uint64_t offset,
bcc16cc1 2423 uint64_t length, bool dont_fragment,
66d4f4fe
EB
2424 bool last, uint32_t context_id,
2425 Error **errp)
3d068aff 2426{
416e34bd 2427 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
bcc16cc1
EB
2428 g_autoptr(NBDExtentArray) ea =
2429 nbd_extent_array_new(nb_extents, client->mode);
3d068aff 2430
dacbb6eb 2431 bitmap_to_extents(bitmap, offset, length, ea);
3d068aff 2432
66d4f4fe 2433 return nbd_co_send_extents(client, request, ea, last, context_id, errp);
e7b1948d
VSO
2434}
2435
2dcbb11b
EB
2436/*
2437 * nbd_co_block_status_payload_read
2438 * Called when a client wants a subset of negotiated contexts via a
2439 * BLOCK_STATUS payload. Check the payload for valid length and
2440 * contents. On success, return 0 with request updated to effective
2441 * length. If request was invalid but all payload consumed, return 0
2442 * with request->len and request->contexts->count set to 0 (which will
2443 * trigger an appropriate NBD_EINVAL response later on). Return
2444 * negative errno if the payload was not fully consumed.
2445 */
2446static int
2447nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
2448 Error **errp)
2449{
2450 uint64_t payload_len = request->len;
2451 g_autofree char *buf = NULL;
2452 size_t count, i, nr_bitmaps;
2453 uint32_t id;
2454
2455 if (payload_len > NBD_MAX_BUFFER_SIZE) {
2456 error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
2457 request->len, NBD_MAX_BUFFER_SIZE);
2458 return -EINVAL;
2459 }
2460
2461 assert(client->contexts.exp == client->exp);
2462 nr_bitmaps = client->exp->nr_export_bitmaps;
2463 request->contexts = g_new0(NBDMetaContexts, 1);
2464 request->contexts->exp = client->exp;
2465
2466 if (payload_len % sizeof(uint32_t) ||
2467 payload_len < sizeof(NBDBlockStatusPayload) ||
2468 payload_len > (sizeof(NBDBlockStatusPayload) +
2469 sizeof(id) * client->contexts.count)) {
2470 goto skip;
2471 }
2472
2473 buf = g_malloc(payload_len);
2474 if (nbd_read(client->ioc, buf, payload_len,
2475 "CMD_BLOCK_STATUS data", errp) < 0) {
2476 return -EIO;
2477 }
2478 trace_nbd_co_receive_request_payload_received(request->cookie,
2479 payload_len);
2480 request->contexts->bitmaps = g_new0(bool, nr_bitmaps);
2481 count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
2482 payload_len = 0;
2483
2484 for (i = 0; i < count; i++) {
2485 id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
2486 if (id == NBD_META_ID_BASE_ALLOCATION) {
2487 if (!client->contexts.base_allocation ||
2488 request->contexts->base_allocation) {
2489 goto skip;
2490 }
2491 request->contexts->base_allocation = true;
2492 } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
2493 if (!client->contexts.allocation_depth ||
2494 request->contexts->allocation_depth) {
2495 goto skip;
2496 }
2497 request->contexts->allocation_depth = true;
2498 } else {
2499 unsigned idx = id - NBD_META_ID_DIRTY_BITMAP;
2500
2501 if (idx >= nr_bitmaps || !client->contexts.bitmaps[idx] ||
2502 request->contexts->bitmaps[idx]) {
2503 goto skip;
2504 }
2505 request->contexts->bitmaps[idx] = true;
2506 }
2507 }
2508
2509 request->len = ldq_be_p(buf);
2510 request->contexts->count = count;
2511 return 0;
2512
2513 skip:
2514 trace_nbd_co_receive_block_status_payload_compliance(request->from,
2515 request->len);
2516 request->len = request->contexts->count = 0;
2517 return nbd_drop(client->ioc, payload_len, errp);
2518}
2519
2a6e128b
VSO
2520/* nbd_co_receive_request
2521 * Collect a client request. Return 0 if request looks valid, -EIO to drop
f148ae7d
SL
2522 * connection right away, -EAGAIN to indicate we were interrupted and the
2523 * channel should be quiesced, and any other negative value to report an error
2524 * to the client (although the caller may still need to disconnect after
2525 * reporting the error).
2a6e128b 2526 */
8db7e2d6
EB
2527static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
2528 NBDRequest *request,
d2223cdd 2529 Error **errp)
a030b347 2530{
72deddc5 2531 NBDClient *client = req->client;
009cd866 2532 bool extended_with_payload;
8db7e2d6
EB
2533 bool check_length = false;
2534 bool check_rofs = false;
2535 bool allocate_buffer = false;
009cd866
EB
2536 bool payload_okay = false;
2537 uint64_t payload_len = 0;
8db7e2d6 2538 int valid_flags = NBD_CMD_FLAG_FUA;
f148ae7d 2539 int ret;
a030b347 2540
1c778ef7 2541 g_assert(qemu_in_coroutine());
ff82911c 2542 assert(client->recv_coroutine == qemu_coroutine_self());
f148ae7d
SL
2543 ret = nbd_receive_request(client, request, errp);
2544 if (ret < 0) {
314b9026 2545 return ret;
a030b347
PB
2546 }
2547
22efd811 2548 trace_nbd_co_receive_request_decode_type(request->cookie, request->type,
3736cc5b 2549 nbd_cmd_lookup(request->type));
009cd866
EB
2550 extended_with_payload = client->mode >= NBD_MODE_EXTENDED &&
2551 request->flags & NBD_CMD_FLAG_PAYLOAD_LEN;
2552 if (extended_with_payload) {
2553 payload_len = request->len;
2554 check_length = true;
2555 }
2556
8db7e2d6
EB
2557 switch (request->type) {
2558 case NBD_CMD_DISC:
29b6c3b3
EB
2559 /* Special case: we're going to disconnect without a reply,
2560 * whether or not flags, from, or len are bogus */
8db7e2d6 2561 req->complete = true;
ee898b87 2562 return -EIO;
29b6c3b3 2563
8db7e2d6
EB
2564 case NBD_CMD_READ:
2565 if (client->mode >= NBD_MODE_STRUCTURED) {
2566 valid_flags |= NBD_CMD_FLAG_DF;
eb38c3b6 2567 }
8db7e2d6
EB
2568 check_length = true;
2569 allocate_buffer = true;
2570 break;
eb38c3b6 2571
8db7e2d6 2572 case NBD_CMD_WRITE:
009cd866
EB
2573 if (client->mode >= NBD_MODE_EXTENDED) {
2574 if (!extended_with_payload) {
2575 /* The client is noncompliant. Trace it, but proceed. */
2576 trace_nbd_co_receive_ext_payload_compliance(request->from,
2577 request->len);
2578 }
2579 valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
2580 }
2581 payload_okay = true;
8db7e2d6
EB
2582 payload_len = request->len;
2583 check_length = true;
2584 allocate_buffer = true;
2585 check_rofs = true;
2586 break;
2587
2588 case NBD_CMD_FLUSH:
2589 break;
2590
2591 case NBD_CMD_TRIM:
2592 check_rofs = true;
2593 break;
2594
2595 case NBD_CMD_CACHE:
2596 check_length = true;
2597 break;
2598
2599 case NBD_CMD_WRITE_ZEROES:
2600 valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
2601 check_rofs = true;
2602 break;
2603
2604 case NBD_CMD_BLOCK_STATUS:
2dcbb11b
EB
2605 if (extended_with_payload) {
2606 ret = nbd_co_block_status_payload_read(client, request, errp);
2607 if (ret < 0) {
2608 return ret;
2609 }
2610 /* payload now consumed */
2611 check_length = false;
2612 payload_len = 0;
2613 valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
2614 } else {
2615 request->contexts = &client->contexts;
2616 }
8db7e2d6
EB
2617 valid_flags |= NBD_CMD_FLAG_REQ_ONE;
2618 break;
2619
2620 default:
2621 /* Unrecognized, will fail later */
2622 ;
2d821488 2623 }
7fa5c565 2624
8db7e2d6
EB
2625 /* Payload and buffer handling. */
2626 if (!payload_len) {
2627 req->complete = true;
2628 }
2629 if (check_length && request->len > NBD_MAX_BUFFER_SIZE) {
2630 /* READ, WRITE, CACHE */
2631 error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
2632 request->len, NBD_MAX_BUFFER_SIZE);
2633 return -EINVAL;
2634 }
009cd866
EB
2635 if (payload_len && !payload_okay) {
2636 /*
2637 * For now, we don't support payloads on other commands; but
2638 * we can keep the connection alive by ignoring the payload.
2639 * We will fail the command later with NBD_EINVAL for the use
2640 * of an unsupported flag (and not for access beyond bounds).
2641 */
2642 assert(request->type != NBD_CMD_WRITE);
2643 request->len = 0;
2644 }
8db7e2d6
EB
2645 if (allocate_buffer) {
2646 /* READ, WRITE */
2647 req->data = blk_try_blockalign(client->exp->common.blk,
2648 request->len);
2649 if (req->data == NULL) {
2650 error_setg(errp, "No memory");
2651 return -ENOMEM;
2652 }
2653 }
2654 if (payload_len) {
009cd866
EB
2655 if (payload_okay) {
2656 /* WRITE */
2657 assert(req->data);
2658 ret = nbd_read(client->ioc, req->data, payload_len,
2659 "CMD_WRITE data", errp);
2660 } else {
2661 ret = nbd_drop(client->ioc, payload_len, errp);
2662 }
8db7e2d6 2663 if (ret < 0) {
ee898b87 2664 return -EIO;
a030b347 2665 }
29b6c3b3 2666 req->complete = true;
22efd811 2667 trace_nbd_co_receive_request_payload_received(request->cookie,
8db7e2d6 2668 payload_len);
a030b347 2669 }
29b6c3b3 2670
fed5f8f8 2671 /* Sanity checks. */
8db7e2d6
EB
2672 if (client->exp->nbdflags & NBD_FLAG_READ_ONLY && check_rofs) {
2673 /* WRITE, TRIM, WRITE_ZEROES */
fed5f8f8
EB
2674 error_setg(errp, "Export is read-only");
2675 return -EROFS;
2676 }
2677 if (request->from > client->exp->size ||
9d26dfcb 2678 request->len > client->exp->size - request->from) {
b2578459 2679 error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu64
2fd2c840 2680 ", Size: %" PRIu64, request->from, request->len,
9d26dfcb 2681 client->exp->size);
fed5f8f8
EB
2682 return (request->type == NBD_CMD_WRITE ||
2683 request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
29b6c3b3 2684 }
6e280648
EB
2685 if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
2686 client->check_align)) {
2687 /*
2688 * The block layer gracefully handles unaligned requests, but
2689 * it's still worth tracing client non-compliance
2690 */
2691 trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
2692 request->from,
2693 request->len,
2694 client->check_align);
2695 }
5c54e7fa
VSO
2696 if (request->flags & ~valid_flags) {
2697 error_setg(errp, "unsupported flags for command %s (got 0x%x)",
2698 nbd_cmd_lookup(request->type), request->flags);
ee898b87 2699 return -EINVAL;
1f4d6d18 2700 }
29b6c3b3 2701
ee898b87 2702 return 0;
a030b347
PB
2703}
2704
6a417599
VSO
2705/* Send simple reply without a payload, or a structured error
2706 * @error_msg is ignored if @ret >= 0
2707 * Returns 0 if connection is still live, -errno on failure to talk to client
2708 */
2709static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
66d4f4fe 2710 NBDRequest *request,
6a417599
VSO
2711 int ret,
2712 const char *error_msg,
2713 Error **errp)
2714{
ac132d05 2715 if (client->mode >= NBD_MODE_STRUCTURED && ret < 0) {
66d4f4fe 2716 return nbd_co_send_chunk_error(client, request, -ret, error_msg, errp);
11d3355f
EB
2717 } else if (client->mode >= NBD_MODE_EXTENDED) {
2718 return nbd_co_send_chunk_done(client, request, errp);
6a417599 2719 } else {
66d4f4fe 2720 return nbd_co_send_simple_reply(client, request, ret < 0 ? -ret : 0,
6a417599
VSO
2721 NULL, 0, errp);
2722 }
2723}
2724
2725/* Handle NBD_CMD_READ request.
2726 * Return -errno if sending fails. Other errors are reported directly to the
2727 * client as an error reply. */
2728static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
2729 uint8_t *data, Error **errp)
2730{
2731 int ret;
2732 NBDExport *exp = client->exp;
2733
7fa5c565 2734 assert(request->type == NBD_CMD_READ);
b2578459 2735 assert(request->len <= NBD_MAX_BUFFER_SIZE);
6a417599
VSO
2736
2737 /* XXX: NBD Protocol only documents use of FUA with WRITE */
2738 if (request->flags & NBD_CMD_FLAG_FUA) {
37a4f70c 2739 ret = blk_co_flush(exp->common.blk);
6a417599 2740 if (ret < 0) {
66d4f4fe 2741 return nbd_send_generic_reply(client, request, ret,
6a417599
VSO
2742 "flush failed", errp);
2743 }
2744 }
2745
ac132d05
EB
2746 if (client->mode >= NBD_MODE_STRUCTURED &&
2747 !(request->flags & NBD_CMD_FLAG_DF) && request->len)
2f454def 2748 {
66d4f4fe 2749 return nbd_co_send_sparse_read(client, request, request->from,
6a417599
VSO
2750 data, request->len, errp);
2751 }
2752
d2223cdd 2753 ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0);
7fa5c565 2754 if (ret < 0) {
66d4f4fe 2755 return nbd_send_generic_reply(client, request, ret,
6a417599
VSO
2756 "reading from file failed", errp);
2757 }
2758
ac132d05 2759 if (client->mode >= NBD_MODE_STRUCTURED) {
6a417599 2760 if (request->len) {
66d4f4fe 2761 return nbd_co_send_chunk_read(client, request, request->from, data,
a7c8ed36 2762 request->len, true, errp);
6a417599 2763 } else {
66d4f4fe 2764 return nbd_co_send_chunk_done(client, request, errp);
6a417599
VSO
2765 }
2766 } else {
66d4f4fe 2767 return nbd_co_send_simple_reply(client, request, 0,
6a417599
VSO
2768 data, request->len, errp);
2769 }
2770}
2771
7fa5c565
VSO
2772/*
2773 * nbd_do_cmd_cache
2774 *
2775 * Handle NBD_CMD_CACHE request.
2776 * Return -errno if sending fails. Other errors are reported directly to the
2777 * client as an error reply.
2778 */
2779static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2780 Error **errp)
2781{
2782 int ret;
2783 NBDExport *exp = client->exp;
2784
2785 assert(request->type == NBD_CMD_CACHE);
b2578459 2786 assert(request->len <= NBD_MAX_BUFFER_SIZE);
7fa5c565 2787
37a4f70c 2788 ret = blk_co_preadv(exp->common.blk, request->from, request->len,
7fa5c565
VSO
2789 NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2790
66d4f4fe 2791 return nbd_send_generic_reply(client, request, ret,
7fa5c565
VSO
2792 "caching data failed", errp);
2793}
2794
6f302e60
VSO
2795/* Handle NBD request.
2796 * Return -errno if sending fails. Other errors are reported directly to the
2797 * client as an error reply. */
2798static coroutine_fn int nbd_handle_request(NBDClient *client,
2799 NBDRequest *request,
2800 uint8_t *data, Error **errp)
2801{
2802 int ret;
2803 int flags;
2804 NBDExport *exp = client->exp;
2805 char *msg;
3b1f244c 2806 size_t i;
6f302e60
VSO
2807
2808 switch (request->type) {
bc37b06a 2809 case NBD_CMD_CACHE:
7fa5c565
VSO
2810 return nbd_do_cmd_cache(client, request, errp);
2811
2812 case NBD_CMD_READ:
6f302e60
VSO
2813 return nbd_do_cmd_read(client, request, data, errp);
2814
2815 case NBD_CMD_WRITE:
2816 flags = 0;
2817 if (request->flags & NBD_CMD_FLAG_FUA) {
2818 flags |= BDRV_REQ_FUA;
2819 }
b2578459 2820 assert(request->len <= NBD_MAX_BUFFER_SIZE);
d2223cdd
PB
2821 ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data,
2822 flags);
66d4f4fe 2823 return nbd_send_generic_reply(client, request, ret,
6f302e60
VSO
2824 "writing to file failed", errp);
2825
2826 case NBD_CMD_WRITE_ZEROES:
2827 flags = 0;
2828 if (request->flags & NBD_CMD_FLAG_FUA) {
2829 flags |= BDRV_REQ_FUA;
2830 }
2831 if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
2832 flags |= BDRV_REQ_MAY_UNMAP;
2833 }
b491dbb7
EB
2834 if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
2835 flags |= BDRV_REQ_NO_FALLBACK;
2836 }
d2223cdd
PB
2837 ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len,
2838 flags);
66d4f4fe 2839 return nbd_send_generic_reply(client, request, ret,
6f302e60
VSO
2840 "writing to file failed", errp);
2841
2842 case NBD_CMD_DISC:
2843 /* unreachable, thanks to special case in nbd_co_receive_request() */
2844 abort();
2845
2846 case NBD_CMD_FLUSH:
37a4f70c 2847 ret = blk_co_flush(exp->common.blk);
66d4f4fe 2848 return nbd_send_generic_reply(client, request, ret,
6f302e60
VSO
2849 "flush failed", errp);
2850
2851 case NBD_CMD_TRIM:
e3557422 2852 ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
890cbccb 2853 if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
37a4f70c 2854 ret = blk_co_flush(exp->common.blk);
65529782 2855 }
66d4f4fe 2856 return nbd_send_generic_reply(client, request, ret,
6f302e60
VSO
2857 "discard failed", errp);
2858
e7b1948d 2859 case NBD_CMD_BLOCK_STATUS:
1dec4643 2860 assert(request->contexts);
bcc16cc1
EB
2861 assert(client->mode >= NBD_MODE_EXTENDED ||
2862 request->len <= UINT32_MAX);
1dec4643 2863 if (request->contexts->count) {
fb7afc79 2864 bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
1dec4643 2865 int contexts_remaining = request->contexts->count;
fb7afc79 2866
2dcbb11b
EB
2867 if (!request->len) {
2868 return nbd_send_generic_reply(client, request, -EINVAL,
2869 "need non-zero length", errp);
2870 }
1dec4643 2871 if (request->contexts->base_allocation) {
66d4f4fe 2872 ret = nbd_co_send_block_status(client, request,
ff7e261b 2873 exp->common.blk,
37a4f70c 2874 request->from,
fb7afc79 2875 request->len, dont_fragment,
47ec485e 2876 !--contexts_remaining,
3d068aff
VSO
2877 NBD_META_ID_BASE_ALLOCATION,
2878 errp);
73e064cc
EB
2879 if (ret < 0) {
2880 return ret;
2881 }
2882 }
2883
1dec4643 2884 if (request->contexts->allocation_depth) {
66d4f4fe 2885 ret = nbd_co_send_block_status(client, request,
ff7e261b 2886 exp->common.blk,
71719cd5
EB
2887 request->from, request->len,
2888 dont_fragment,
2889 !--contexts_remaining,
2890 NBD_META_ID_ALLOCATION_DEPTH,
2891 errp);
2892 if (ret < 0) {
2893 return ret;
2894 }
2895 }
2896
1dec4643 2897 assert(request->contexts->exp == client->exp);
3b1f244c 2898 for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
1dec4643 2899 if (!request->contexts->bitmaps[i]) {
3b1f244c
EB
2900 continue;
2901 }
66d4f4fe 2902 ret = nbd_co_send_bitmap(client, request,
3b1f244c 2903 client->exp->export_bitmaps[i],
3d068aff 2904 request->from, request->len,
47ec485e 2905 dont_fragment, !--contexts_remaining,
3b1f244c 2906 NBD_META_ID_DIRTY_BITMAP + i, errp);
73e064cc
EB
2907 if (ret < 0) {
2908 return ret;
2909 }
3d068aff
VSO
2910 }
2911
47ec485e
EB
2912 assert(!contexts_remaining);
2913
73e064cc 2914 return 0;
1dec4643
EB
2915 } else if (client->contexts.count) {
2916 return nbd_send_generic_reply(client, request, -EINVAL,
2917 "CMD_BLOCK_STATUS payload not valid",
2918 errp);
e7b1948d 2919 } else {
66d4f4fe 2920 return nbd_send_generic_reply(client, request, -EINVAL,
e7b1948d
VSO
2921 "CMD_BLOCK_STATUS not negotiated",
2922 errp);
2923 }
2924
6f302e60
VSO
2925 default:
2926 msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
2927 request->type);
66d4f4fe 2928 ret = nbd_send_generic_reply(client, request, -EINVAL, msg,
6f302e60
VSO
2929 errp);
2930 g_free(msg);
2931 return ret;
2932 }
2933}
2934
ff82911c
PB
2935/* Owns a reference to the NBDClient passed as opaque. */
2936static coroutine_fn void nbd_trip(void *opaque)
75818250 2937{
262db388 2938 NBDClient *client = opaque;
315f78ab 2939 NBDRequestData *req;
ff82911c 2940 NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
a0dc63a6 2941 int ret;
2fd2c840 2942 Error *local_err = NULL;
b2e3d87f 2943
9588463e 2944 trace_nbd_trip();
ff2b68aa 2945 if (client->closing) {
ff82911c 2946 nbd_client_put(client);
ff2b68aa
PB
2947 return;
2948 }
b2e3d87f 2949
f148ae7d
SL
2950 if (client->quiescing) {
2951 /*
2952 * We're switching between AIO contexts. Don't attempt to receive a new
2953 * request and kick the main context which may be waiting for us.
2954 */
2955 nbd_client_put(client);
2956 client->recv_coroutine = NULL;
2957 aio_wait_kick();
2958 return;
2959 }
2960
ff2b68aa 2961 req = nbd_request_get(client);
2fd2c840 2962 ret = nbd_co_receive_request(req, &request, &local_err);
ee898b87 2963 client->recv_coroutine = NULL;
b2e3d87f 2964
d6268348
WC
2965 if (client->closing) {
2966 /*
2967 * The client may be closed when we are blocked in
2968 * nbd_co_receive_request()
2969 */
2970 goto done;
2971 }
2972
f148ae7d
SL
2973 if (ret == -EAGAIN) {
2974 assert(client->quiescing);
2975 goto done;
2976 }
2977
a0d7ce20
VSO
2978 nbd_client_receive_next_request(client);
2979 if (ret == -EIO) {
2980 goto disconnect;
2981 }
2982
bd2cd4a4
FW
2983 qio_channel_set_cork(client->ioc, true);
2984
a0d7ce20 2985 if (ret < 0) {
314b9026 2986 /* It wasn't -EIO, so, according to nbd_co_receive_request()
6a417599
VSO
2987 * semantics, we should return the error to the client. */
2988 Error *export_err = local_err;
2989
2990 local_err = NULL;
66d4f4fe 2991 ret = nbd_send_generic_reply(client, &request, -EINVAL,
6a417599
VSO
2992 error_get_pretty(export_err), &local_err);
2993 error_free(export_err);
6f302e60
VSO
2994 } else {
2995 ret = nbd_handle_request(client, &request, req->data, &local_err);
5c54e7fa 2996 }
1dec4643
EB
2997 if (request.contexts && request.contexts != &client->contexts) {
2998 assert(request.type == NBD_CMD_BLOCK_STATUS);
2999 g_free(request.contexts->bitmaps);
3000 g_free(request.contexts);
3001 }
5c54e7fa 3002 if (ret < 0) {
c7b97282 3003 error_prepend(&local_err, "Failed to send reply: ");
2fd2c840
VSO
3004 goto disconnect;
3005 }
3006
2dcbb11b
EB
3007 /*
3008 * We must disconnect after NBD_CMD_WRITE or BLOCK_STATUS with
3009 * payload if we did not read the payload.
8c372a02 3010 */
2fd2c840
VSO
3011 if (!req->complete) {
3012 error_setg(&local_err, "Request handling failed in intermediate state");
8c372a02 3013 goto disconnect;
b2e3d87f
NT
3014 }
3015
bd2cd4a4 3016 qio_channel_set_cork(client->ioc, false);
7fe7b68b 3017done:
262db388 3018 nbd_request_put(req);
ff82911c 3019 nbd_client_put(client);
262db388
PB
3020 return;
3021
8c372a02 3022disconnect:
2fd2c840
VSO
3023 if (local_err) {
3024 error_reportf_err(local_err, "Disconnect client, due to: ");
3025 }
72deddc5 3026 nbd_request_put(req);
0c9390d9 3027 client_close(client, true);
ff82911c 3028 nbd_client_put(client);
7a5ca864 3029}
af49bbbe 3030
ff82911c 3031static void nbd_client_receive_next_request(NBDClient *client)
958c717d 3032{
f148ae7d
SL
3033 if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
3034 !client->quiescing) {
ff82911c
PB
3035 nbd_client_get(client);
3036 client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
8612c686 3037 aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
958c717d
HR
3038 }
3039}
3040
1a6245a5
FZ
3041static coroutine_fn void nbd_co_client_start(void *opaque)
3042{
c84087f2 3043 NBDClient *client = opaque;
2fd2c840 3044 Error *local_err = NULL;
1a6245a5 3045
df8ad9f1
EB
3046 qemu_co_mutex_init(&client->send_lock);
3047
2fd2c840
VSO
3048 if (nbd_negotiate(client, &local_err)) {
3049 if (local_err) {
3050 error_report_err(local_err);
3051 }
0c9390d9 3052 client_close(client, false);
c84087f2 3053 return;
1a6245a5 3054 }
ff82911c
PB
3055
3056 nbd_client_receive_next_request(client);
1a6245a5
FZ
3057}
3058
0c9390d9 3059/*
7f7dfe2a
VSO
3060 * Create a new client listener using the given channel @sioc.
3061 * Begin servicing it in a coroutine. When the connection closes, call
3062 * @close_fn with an indication of whether the client completed negotiation.
0c9390d9 3063 */
7f7dfe2a 3064void nbd_client_new(QIOChannelSocket *sioc,
f95910fe 3065 QCryptoTLSCreds *tlscreds,
b25e12da 3066 const char *tlsauthz,
0c9390d9 3067 void (*close_fn)(NBDClient *, bool))
af49bbbe 3068{
1743b515 3069 NBDClient *client;
c84087f2 3070 Coroutine *co;
1a6245a5 3071
e8d3eb74 3072 client = g_new0(NBDClient, 1);
1743b515 3073 client->refcount = 1;
f95910fe
DB
3074 client->tlscreds = tlscreds;
3075 if (tlscreds) {
3076 object_ref(OBJECT(client->tlscreds));
3077 }
b25e12da 3078 client->tlsauthz = g_strdup(tlsauthz);
1c778ef7 3079 client->sioc = sioc;
f1426881 3080 qio_channel_set_delay(QIO_CHANNEL(sioc), false);
1c778ef7
DB
3081 object_ref(OBJECT(client->sioc));
3082 client->ioc = QIO_CHANNEL(sioc);
3083 object_ref(OBJECT(client->ioc));
0c9390d9 3084 client->close_fn = close_fn;
2c8d9f06 3085
c84087f2
VSO
3086 co = qemu_coroutine_create(nbd_co_client_start, client);
3087 qemu_coroutine_enter(co);
af49bbbe 3088}