]> git.proxmox.com Git - mirror_qemu.git/blame - nbd/server.c
disas/riscv: Clean up includes
[mirror_qemu.git] / nbd / server.c
CommitLineData
75818250 1/*
a7c8ed36 2 * Copyright Red Hat
7a5ca864
FB
3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
4 *
798bfe00 5 * Network Block Device Server Side
7a5ca864
FB
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; under version 2 of the License.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
8167ee88 17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
75818250 18 */
7a5ca864 19
d38ea87a 20#include "qemu/osdep.h"
56ee8626 21
e2c1c34f 22#include "block/block_int.h"
56ee8626 23#include "block/export.h"
e2c1c34f 24#include "block/dirty-bitmap.h"
da34e65c 25#include "qapi/error.h"
dc5e9ac7 26#include "qemu/queue.h"
9588463e 27#include "trace.h"
798bfe00 28#include "nbd-internal.h"
416e34bd 29#include "qemu/units.h"
5df022cf 30#include "qemu/memalign.h"
ca441480 31
e7b1948d 32#define NBD_META_ID_BASE_ALLOCATION 0
71719cd5 33#define NBD_META_ID_ALLOCATION_DEPTH 1
3b1f244c 34/* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
71719cd5 35#define NBD_META_ID_DIRTY_BITMAP 2
3d068aff 36
416e34bd
EB
37/*
38 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
3d068aff
VSO
39 * constant. If an increase is needed, note that the NBD protocol
40 * recommends no larger than 32 mb, so that the client won't consider
416e34bd
EB
41 * the reply as a denial of service attack.
42 */
43#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
e7b1948d 44
ca441480
PB
45static int system_errno_to_nbd_errno(int err)
46{
47 switch (err) {
48 case 0:
49 return NBD_SUCCESS;
50 case EPERM:
c0301fcc 51 case EROFS:
ca441480
PB
52 return NBD_EPERM;
53 case EIO:
54 return NBD_EIO;
55 case ENOMEM:
56 return NBD_ENOMEM;
57#ifdef EDQUOT
58 case EDQUOT:
59#endif
60 case EFBIG:
61 case ENOSPC:
62 return NBD_ENOSPC;
bae245d1
EB
63 case EOVERFLOW:
64 return NBD_EOVERFLOW;
0a479545
EB
65 case ENOTSUP:
66#if ENOTSUP != EOPNOTSUPP
67 case EOPNOTSUPP:
68#endif
69 return NBD_ENOTSUP;
b6f5d3b5
EB
70 case ESHUTDOWN:
71 return NBD_ESHUTDOWN;
ca441480
PB
72 case EINVAL:
73 default:
74 return NBD_EINVAL;
75 }
76}
77
9a304d29
PB
/* Definitions for opaque data types */

typedef struct NBDRequestData NBDRequestData;

/* Per-request bookkeeping while a request is in flight. */
struct NBDRequestData {
    NBDClient *client;   /* owning client connection */
    uint8_t *data;       /* payload buffer for read/write requests */
    bool complete;       /* true once the request has been fully handled */
};

/* One NBD export; embeds the generic block-export state. */
struct NBDExport {
    BlockExport common;

    char *name;                        /* export name advertised to clients */
    char *description;                 /* optional human-readable description */
    uint64_t size;                     /* export size in bytes */
    uint16_t nbdflags;                 /* NBD_FLAG_* transmission flags */
    QTAILQ_HEAD(, NBDClient) clients;  /* clients attached to this export */
    QTAILQ_ENTRY(NBDExport) next;      /* link in global exports list */

    BlockBackend *eject_notifier_blk;  /* blk whose eject we watch */
    Notifier eject_notifier;

    bool allocation_depth;             /* qemu:allocation-depth offered? */
    BdrvDirtyBitmap **export_bitmaps;  /* bitmaps exported as meta contexts */
    size_t nr_export_bitmaps;
};

static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

/*
 * NBDMetaContexts represents a list of meta contexts in use,
 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
 * NBD_OPT_LIST_META_CONTEXT.
 */
struct NBDMetaContexts {
    const NBDExport *exp; /* associated export */
    size_t count; /* number of negotiated contexts */
    bool base_allocation; /* export base:allocation context (block status) */
    bool allocation_depth; /* export qemu:allocation-depth */
    bool *bitmaps; /*
                    * export qemu:dirty-bitmap:<export bitmap name>,
                    * sized by exp->nr_export_bitmaps
                    */
};

/* State for one client connection, negotiation and transmission phases. */
struct NBDClient {
    int refcount; /* atomic */
    void (*close_fn)(NBDClient *client, bool negotiated);

    QemuMutex lock;

    NBDExport *exp;
    QCryptoTLSCreds *tlscreds;
    char *tlsauthz;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine; /* protected by lock */

    CoMutex send_lock;
    Coroutine *send_coroutine;

    bool read_yielding; /* protected by lock */
    bool quiescing; /* protected by lock */

    QTAILQ_ENTRY(NBDClient) next;
    int nb_requests; /* protected by lock */
    bool closing; /* protected by lock */

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    NBDMode mode; /* most advanced mode negotiated so far */
    NBDMetaContexts contexts; /* Negotiated meta contexts */

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};
7a5ca864 157
ff82911c 158static void nbd_client_receive_next_request(NBDClient *client);
958c717d 159
6b8c01e7 160/* Basic flow for negotiation
7a5ca864
FB
161
162 Server Client
7a5ca864 163 Negotiate
6b8c01e7
PB
164
165 or
166
167 Server Client
168 Negotiate #1
169 Option
170 Negotiate #2
171
172 ----
173
174 followed by
175
176 Server Client
7a5ca864
FB
177 Request
178 Response
179 Request
180 Response
181 ...
182 ...
183 Request (type == 2)
6b8c01e7 184
7a5ca864
FB
185*/
186
1d17922a
VSO
/*
 * Fill in an option reply header in big-endian (wire) order for the
 * given option, reply type and payload length.
 */
static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
                                     uint32_t type, uint32_t length)
{
    stq_be_p(&rep->magic, NBD_REP_MAGIC);
    stl_be_p(&rep->option, option);
    stl_be_p(&rep->type, type);
    stl_be_p(&rep->length, length);
}
195
526e5c65
EB
/* Send a reply header, including length, but no payload.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
                                      uint32_t len, Error **errp)
{
    NBDOptionReply rep;

    trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
                                     type, nbd_rep_lookup(type), len);

    /* Replies never need more payload than a request could carry */
    assert(len < NBD_MAX_BUFFER_SIZE);

    set_be_option_rep(&rep, client->opt, type, len);
    return nbd_write(client->ioc, &rep, sizeof(rep), errp);
}
6b8c01e7 211
526e5c65
EB
/* Send a reply header with default 0 length.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
                                  Error **errp)
{
    /* Convenience wrapper for payload-free replies such as NBD_REP_ACK */
    return nbd_negotiate_send_rep_len(client, type, 0, errp);
}
219
36683283
EB
220/* Send an error reply.
221 * Return -errno on error, 0 on success. */
9edc6313 222static int G_GNUC_PRINTF(4, 0)
41f5dfaf
EB
223nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
224 Error **errp, const char *fmt, va_list va)
36683283 225{
795d946d 226 ERRP_GUARD();
df18c04e 227 g_autofree char *msg = NULL;
36683283
EB
228 int ret;
229 size_t len;
230
36683283 231 msg = g_strdup_vprintf(fmt, va);
36683283 232 len = strlen(msg);
5c4fe018 233 assert(len < NBD_MAX_STRING_SIZE);
9588463e 234 trace_nbd_negotiate_send_rep_err(msg);
0cfae925 235 ret = nbd_negotiate_send_rep_len(client, type, len, errp);
36683283 236 if (ret < 0) {
df18c04e 237 return ret;
36683283 238 }
0cfae925 239 if (nbd_write(client->ioc, msg, len, errp) < 0) {
2fd2c840 240 error_prepend(errp, "write failed (error message): ");
df18c04e 241 return -EIO;
36683283 242 }
2fd2c840 243
df18c04e 244 return 0;
36683283
EB
245}
246
5c4fe018
EB
/*
 * Return a malloc'd copy of @name suitable for use in an error reply.
 * Overlong names are truncated to 80 characters plus an ellipsis.
 */
static char *
nbd_sanitize_name(const char *name)
{
    /* XXX Should we also try to sanitize any control characters? */
    if (strnlen(name, 80) == 80) {
        return g_strdup_printf("%.80s...", name);
    }
    return g_strdup(name);
}
259
41f5dfaf
EB
260/* Send an error reply.
261 * Return -errno on error, 0 on success. */
9edc6313 262static int G_GNUC_PRINTF(4, 5)
41f5dfaf
EB
263nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
264 Error **errp, const char *fmt, ...)
265{
266 va_list va;
267 int ret;
268
269 va_start(va, fmt);
270 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
271 va_end(va);
272 return ret;
273}
274
894e0280
EB
275/* Drop remainder of the current option, and send a reply with the
276 * given error type and message. Return -errno on read or write
277 * failure; or 0 if connection is still live. */
9edc6313 278static int G_GNUC_PRINTF(4, 0)
2e425fd5
VSO
279nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
280 const char *fmt, va_list va)
894e0280
EB
281{
282 int ret = nbd_drop(client->ioc, client->optlen, errp);
894e0280
EB
283
284 client->optlen = 0;
285 if (!ret) {
894e0280 286 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
894e0280
EB
287 }
288 return ret;
289}
290
9edc6313 291static int G_GNUC_PRINTF(4, 5)
2e425fd5
VSO
292nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
293 const char *fmt, ...)
294{
295 int ret;
296 va_list va;
297
298 va_start(va, fmt);
299 ret = nbd_opt_vdrop(client, type, errp, fmt, va);
300 va_end(va);
301
302 return ret;
303}
304
9edc6313 305static int G_GNUC_PRINTF(3, 4)
2e425fd5
VSO
306nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
307{
308 int ret;
309 va_list va;
310
311 va_start(va, fmt);
312 ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
313 va_end(va);
314
315 return ret;
316}
317
894e0280 318/* Read size bytes from the unparsed payload of the current option.
d1e2c3e7 319 * If @check_nul, require that no NUL bytes appear in buffer.
894e0280
EB
320 * Return -errno on I/O error, 0 if option was completely handled by
321 * sending a reply about inconsistent lengths, or 1 on success. */
322static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
d1e2c3e7 323 bool check_nul, Error **errp)
894e0280
EB
324{
325 if (size > client->optlen) {
2e425fd5
VSO
326 return nbd_opt_invalid(client, errp,
327 "Inconsistent lengths in option %s",
328 nbd_opt_lookup(client->opt));
894e0280
EB
329 }
330 client->optlen -= size;
d1e2c3e7
EB
331 if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
332 return -EIO;
333 }
334
335 if (check_nul && strnlen(buffer, size) != size) {
336 return nbd_opt_invalid(client, errp,
337 "Unexpected embedded NUL in option %s",
338 nbd_opt_lookup(client->opt));
339 }
340 return 1;
894e0280
EB
341}
342
e7b1948d
VSO
343/* Drop size bytes from the unparsed payload of the current option.
344 * Return -errno on I/O error, 0 if option was completely handled by
345 * sending a reply about inconsistent lengths, or 1 on success. */
346static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
347{
348 if (size > client->optlen) {
349 return nbd_opt_invalid(client, errp,
350 "Inconsistent lengths in option %s",
351 nbd_opt_lookup(client->opt));
352 }
353 client->optlen -= size;
354 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
355}
356
12296459
VSO
357/* nbd_opt_read_name
358 *
359 * Read a string with the format:
93676c88 360 * uint32_t len (<= NBD_MAX_STRING_SIZE)
12296459
VSO
361 * len bytes string (not 0-terminated)
362 *
9d7ab222 363 * On success, @name will be allocated.
12296459
VSO
364 * If @length is non-null, it will be set to the actual string length.
365 *
366 * Return -errno on I/O error, 0 if option was completely handled by
367 * sending a reply about inconsistent lengths, or 1 on success.
368 */
9d7ab222 369static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
12296459
VSO
370 Error **errp)
371{
372 int ret;
373 uint32_t len;
9d7ab222 374 g_autofree char *local_name = NULL;
12296459 375
9d7ab222 376 *name = NULL;
d1e2c3e7 377 ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
12296459
VSO
378 if (ret <= 0) {
379 return ret;
380 }
80c7c2b0 381 len = cpu_to_be32(len);
12296459 382
93676c88 383 if (len > NBD_MAX_STRING_SIZE) {
12296459
VSO
384 return nbd_opt_invalid(client, errp,
385 "Invalid name length: %" PRIu32, len);
386 }
387
9d7ab222 388 local_name = g_malloc(len + 1);
d1e2c3e7 389 ret = nbd_opt_read(client, local_name, len, true, errp);
12296459
VSO
390 if (ret <= 0) {
391 return ret;
392 }
9d7ab222 393 local_name[len] = '\0';
12296459
VSO
394
395 if (length) {
396 *length = len;
397 }
9d7ab222 398 *name = g_steal_pointer(&local_name);
12296459
VSO
399
400 return 1;
401}
402
526e5c65
EB
/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
                                       Error **errp)
{
    ERRP_GUARD();
    size_t name_len, desc_len;
    uint32_t len;
    const char *name = exp->name ? exp->name : "";
    const char *desc = exp->description ? exp->description : "";
    QIOChannel *ioc = client->ioc;
    int ret;

    trace_nbd_negotiate_send_rep_list(name, desc);
    name_len = strlen(name);
    desc_len = strlen(desc);
    assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
    /* Payload: 32-bit name length, name, then description to end of reply */
    len = name_len + desc_len + sizeof(len);
    ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
    if (ret < 0) {
        return ret;
    }

    len = cpu_to_be32(name_len);
    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
        error_prepend(errp, "write failed (name length): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, name, name_len, errp) < 0) {
        error_prepend(errp, "write failed (name buffer): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
        error_prepend(errp, "write failed (description buffer): ");
        return -EINVAL;
    }

    return 0;
}
444
526e5c65
EB
445/* Process the NBD_OPT_LIST command, with a potential series of replies.
446 * Return -errno on error, 0 on success. */
e68c35cf 447static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
32d7d2e0 448{
32d7d2e0 449 NBDExport *exp;
0cfae925 450 assert(client->opt == NBD_OPT_LIST);
32d7d2e0 451
32d7d2e0
HB
452 /* For each export, send a NBD_REP_SERVER reply. */
453 QTAILQ_FOREACH(exp, &exports, next) {
0cfae925 454 if (nbd_negotiate_send_rep_list(client, exp, errp)) {
32d7d2e0
HB
455 return -EINVAL;
456 }
457 }
458 /* Finish with a NBD_REP_ACK. */
0cfae925 459 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
32d7d2e0
HB
460}
461
fd358d83 462static void nbd_check_meta_export(NBDClient *client, NBDExport *exp)
e7b1948d 463{
fd358d83
EB
464 if (exp != client->contexts.exp) {
465 client->contexts.count = 0;
47ec485e 466 }
e7b1948d
VSO
467}
468
f37708f6
EB
/* Send a reply to NBD_OPT_EXPORT_NAME.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
                                            Error **errp)
{
    ERRP_GUARD();
    g_autofree char *name = NULL;
    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
    size_t len;
    int ret;
    uint16_t myflags;

    /* Client sends:
        [20 ..  xx]   export name (length bytes)
       Server replies:
        [ 0 ..   7]   size
        [ 8 ..   9]   export flags
        [10 .. 133]   reserved     (0) [unless no_zeroes]
     */
    trace_nbd_negotiate_handle_export_name();
    /* NBD_OPT_EXPORT_NAME is incompatible with extended headers */
    if (client->mode >= NBD_MODE_EXTENDED) {
        error_setg(errp, "Extended headers already negotiated");
        return -EINVAL;
    }
    if (client->optlen > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "Bad length received");
        return -EINVAL;
    }
    name = g_malloc(client->optlen + 1);
    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
        return -EIO;
    }
    name[client->optlen] = '\0';
    client->optlen = 0;

    trace_nbd_negotiate_handle_export_name_request(name);

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        /* No way to report failure to the client; drop the connection */
        error_setg(errp, "export not found");
        return -EINVAL;
    }
    /* Drop any meta contexts that were negotiated for a different export */
    nbd_check_meta_export(client, client->exp);

    myflags = client->exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED && client->contexts.count) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
    stq_be_p(buf, client->exp->size);
    stw_be_p(buf + 8, myflags);
    len = no_zeroes ? 10 : sizeof(buf);
    ret = nbd_write(client->ioc, buf, len, errp);
    if (ret < 0) {
        error_prepend(errp, "write failed: ");
        return ret;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    blk_exp_ref(&client->exp->common);

    return 0;
}
535
f37708f6
EB
536/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
537 * The buffer does NOT include the info type prefix.
538 * Return -errno on error, 0 if ready to send more. */
0cfae925 539static int nbd_negotiate_send_info(NBDClient *client,
f37708f6
EB
540 uint16_t info, uint32_t length, void *buf,
541 Error **errp)
542{
543 int rc;
544
545 trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
0cfae925 546 rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
f37708f6
EB
547 sizeof(info) + length, errp);
548 if (rc < 0) {
549 return rc;
550 }
80c7c2b0 551 info = cpu_to_be16(info);
f37708f6
EB
552 if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
553 return -EIO;
554 }
555 if (nbd_write(client->ioc, buf, length, errp) < 0) {
556 return -EIO;
557 }
558 return 0;
559}
560
a16a7907
EB
561/* nbd_reject_length: Handle any unexpected payload.
562 * @fatal requests that we quit talking to the client, even if we are able
563 * to successfully send an error reply.
564 * Return:
565 * -errno transmission error occurred or @fatal was requested, errp is set
566 * 0 error message successfully sent to client, errp is not set
567 */
0cfae925 568static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
a16a7907
EB
569{
570 int ret;
571
0cfae925 572 assert(client->optlen);
2e425fd5
VSO
573 ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
574 nbd_opt_lookup(client->opt));
a16a7907 575 if (fatal && !ret) {
894e0280 576 error_setg(errp, "option '%s' has unexpected length",
0cfae925 577 nbd_opt_lookup(client->opt));
a16a7907
EB
578 return -EINVAL;
579 }
580 return ret;
581}
582
f37708f6
EB
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase. */
static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen = 0;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    /* Any leftover payload is a protocol violation */
    if (client->optlen) {
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }
    if (client->opt == NBD_OPT_GO) {
        /* Drop meta contexts negotiated for a different export */
        nbd_check_meta_export(client, exp);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED &&
        (client->contexts.count || client->opt == NBD_OPT_INFO)) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->common.blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    if (client->opt == NBD_OPT_GO) {
        /* Attach the client to the export and enter transmission phase */
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        blk_exp_ref(&client->exp->common);
        rc = 1;
    }
    return rc;
}
750
751
36683283
EB
/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 * new channel for all further (now-encrypted) communication. */
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
                                                 Error **errp)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSHandshakeData data = { 0 };

    assert(client->opt == NBD_OPT_STARTTLS);

    trace_nbd_negotiate_handle_starttls();
    ioc = client->ioc;

    /* Ack the option before the handshake begins on the same channel */
    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsauthz,
                                      errp);
    if (!tioc) {
        return NULL;
    }

    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
    trace_nbd_negotiate_handle_starttls_handshake();
    /* Drive the async handshake to completion via a nested main loop */
    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    qio_channel_tls_handshake(tioc,
                              nbd_tls_handshake,
                              &data,
                              NULL,
                              NULL);

    if (!data.complete) {
        g_main_loop_run(data.loop);
    }
    g_main_loop_unref(data.loop);
    if (data.error) {
        /* Handshake failed: discard the TLS channel and report why */
        object_unref(OBJECT(tioc));
        error_propagate(errp, data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}
799
e7b1948d
VSO
/* nbd_negotiate_send_meta_context
 *
 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
 *
 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
 */
static int nbd_negotiate_send_meta_context(NBDClient *client,
                                           const char *context,
                                           uint32_t context_id,
                                           Error **errp)
{
    NBDOptionReplyMetaContext opt;
    struct iovec iov[] = {
        {.iov_base = &opt, .iov_len = sizeof(opt)},
        {.iov_base = (void *)context, .iov_len = strlen(context)}
    };

    assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* LIST replies carry no meaningful context id */
        context_id = 0;
    }

    trace_nbd_negotiate_meta_query_reply(context, context_id);
    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
    stl_be_p(&opt.context_id, context_id);

    /* Header and context name go out as a single gathered write */
    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
}
829
ebd57062
EB
830/*
831 * Return true if @query matches @pattern, or if @query is empty when
832 * the @client is performing _LIST_.
dbb8b396 833 */
ebd57062
EB
834static bool nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
835 const char *query)
e7b1948d 836{
ebd57062
EB
837 if (!*query) {
838 trace_nbd_negotiate_meta_query_parse("empty");
839 return client->opt == NBD_OPT_LIST_META_CONTEXT;
e7b1948d 840 }
ebd57062 841 if (strcmp(query, pattern) == 0) {
b0769d8f 842 trace_nbd_negotiate_meta_query_parse(pattern);
ebd57062 843 return true;
e7b1948d 844 }
ebd57062
EB
845 trace_nbd_negotiate_meta_query_skip("pattern not matched");
846 return false;
e7b1948d
VSO
847}
848
/*
 * Return true and adjust @str in place if it begins with @prefix.
 */
static bool nbd_strshift(const char **str, const char *prefix)
{
    size_t n = strlen(prefix);

    if (strncmp(*str, prefix, n) != 0) {
        return false;
    }
    *str += n;
    return true;
}
862
863/* nbd_meta_base_query
864 *
865 * Handle queries to 'base' namespace. For now, only the base:allocation
ebd57062 866 * context is available. Return true if @query has been handled.
b0769d8f 867 */
fd358d83 868static bool nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
ebd57062 869 const char *query)
b0769d8f 870{
ebd57062
EB
871 if (!nbd_strshift(&query, "base:")) {
872 return false;
873 }
874 trace_nbd_negotiate_meta_query_parse("base:");
875
876 if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
877 meta->base_allocation = true;
878 }
879 return true;
b0769d8f
VSO
880}
881
/* nbd_meta_qemu_query
 *
 * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
 * and qemu:allocation-depth contexts are available. Return true if @query
 * has been handled.
 */
static bool nbd_meta_qemu_query(NBDClient *client, NBDMetaContexts *meta,
                                const char *query)
{
    size_t i;

    if (!nbd_strshift(&query, "qemu:")) {
        return false;
    }
    trace_nbd_negotiate_meta_query_parse("qemu:");

    if (!*query) {
        /* Empty query: on LIST, select every context in the namespace */
        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
            meta->allocation_depth = meta->exp->allocation_depth;
            if (meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
        }
        trace_nbd_negotiate_meta_query_parse("empty");
        return true;
    }

    if (strcmp(query, "allocation-depth") == 0) {
        trace_nbd_negotiate_meta_query_parse("allocation-depth");
        meta->allocation_depth = meta->exp->allocation_depth;
        return true;
    }

    if (nbd_strshift(&query, "dirty-bitmap:")) {
        trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
        if (!*query) {
            /* Empty bitmap name: on LIST, select every exported bitmap */
            if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
                meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
            trace_nbd_negotiate_meta_query_parse("empty");
            return true;
        }

        /* Otherwise select only the bitmap whose name matches, if any */
        for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
            const char *bm_name;

            bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
            if (strcmp(bm_name, query) == 0) {
                meta->bitmaps[i] = true;
                trace_nbd_negotiate_meta_query_parse(query);
                return true;
            }
        }
        trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
        return true;
    }

    trace_nbd_negotiate_meta_query_skip("unknown qemu context");
    return true;
}
943
e7b1948d
VSO
944/* nbd_negotiate_meta_query
945 *
946 * Parse namespace name and call corresponding function to parse body of the
947 * query.
948 *
93676c88 949 * The only supported namespaces are 'base' and 'qemu'.
e7b1948d 950 *
e7b1948d
VSO
951 * Return -errno on I/O error, 0 if option was completely handled by
952 * sending a reply about inconsistent lengths, or 1 on success. */
953static int nbd_negotiate_meta_query(NBDClient *client,
fd358d83 954 NBDMetaContexts *meta, Error **errp)
e7b1948d
VSO
955{
956 int ret;
ebd57062 957 g_autofree char *query = NULL;
e7b1948d
VSO
958 uint32_t len;
959
d1e2c3e7 960 ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
e7b1948d
VSO
961 if (ret <= 0) {
962 return ret;
963 }
80c7c2b0 964 len = cpu_to_be32(len);
e7b1948d 965
93676c88
EB
966 if (len > NBD_MAX_STRING_SIZE) {
967 trace_nbd_negotiate_meta_query_skip("length too long");
968 return nbd_opt_skip(client, len, errp);
969 }
e7b1948d 970
ebd57062
EB
971 query = g_malloc(len + 1);
972 ret = nbd_opt_read(client, query, len, true, errp);
e7b1948d
VSO
973 if (ret <= 0) {
974 return ret;
975 }
ebd57062 976 query[len] = '\0';
3d068aff 977
ebd57062
EB
978 if (nbd_meta_base_query(client, meta, query)) {
979 return 1;
980 }
981 if (nbd_meta_qemu_query(client, meta, query)) {
982 return 1;
e7b1948d
VSO
983 }
984
3d068aff 985 trace_nbd_negotiate_meta_query_skip("unknown namespace");
ebd57062 986 return 1;
e7b1948d
VSO
987}
988
/* nbd_negotiate_meta_queries
 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
 *
 * Return -errno on I/O error, or 0 if option was completely handled. */
static int nbd_negotiate_meta_queries(NBDClient *client, Error **errp)
{
    int ret;
    g_autofree char *export_name = NULL;
    /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
    g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
    NBDMetaContexts local_meta = {0};
    NBDMetaContexts *meta;
    uint32_t nb_queries;
    size_t i;
    size_t count = 0;  /* number of context replies sent to the client */

    /* SET requires structured replies to have been negotiated first */
    if (client->opt == NBD_OPT_SET_META_CONTEXT &&
        client->mode < NBD_MODE_STRUCTURED) {
        return nbd_opt_invalid(client, errp,
                               "request option '%s' when structured reply "
                               "is not negotiated",
                               nbd_opt_lookup(client->opt));
    }

    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* Only change the caller's meta on SET. */
        meta = &local_meta;
    } else {
        meta = &client->contexts;
    }

    /* Discard any state from an earlier SET before re-parsing */
    g_free(meta->bitmaps);
    memset(meta, 0, sizeof(*meta));

    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
    if (ret <= 0) {
        return ret;
    }

    meta->exp = nbd_export_find(export_name);
    if (meta->exp == NULL) {
        g_autofree char *sane_name = nbd_sanitize_name(export_name);

        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
                            "export '%s' not present", sane_name);
    }
    meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* local_meta goes out of scope; let g_autofree reclaim the array */
        bitmaps = meta->bitmaps;
    }

    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
    if (ret <= 0) {
        return ret;
    }
    /* Big-endian wire value to host order (cpu_to_be32 is the same swap) */
    nb_queries = cpu_to_be32(nb_queries);
    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
                                     export_name, nb_queries);

    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
        /* enable all known contexts */
        meta->base_allocation = true;
        meta->allocation_depth = meta->exp->allocation_depth;
        if (meta->exp->nr_export_bitmaps) {
            memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
        }
    } else {
        for (i = 0; i < nb_queries; ++i) {
            ret = nbd_negotiate_meta_query(client, meta, errp);
            if (ret <= 0) {
                return ret;
            }
        }
    }

    /* Reply once per selected context, then ACK */
    if (meta->base_allocation) {
        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
                                              NBD_META_ID_BASE_ALLOCATION,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    if (meta->allocation_depth) {
        ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
                                              NBD_META_ID_ALLOCATION_DEPTH,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
        const char *bm_name;
        g_autofree char *context = NULL;

        if (!meta->bitmaps[i]) {
            continue;
        }

        bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
        context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);

        /* Context id encodes the bitmap's index within the export */
        ret = nbd_negotiate_send_meta_context(client, context,
                                              NBD_META_ID_DIRTY_BITMAP + i,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (ret == 0) {
        meta->count = count;
    }

    return ret;
}
1111
/* nbd_negotiate_options
 * Process all NBD_OPT_* client option commands, during fixed newstyle
 * negotiation.
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static int nbd_negotiate_options(NBDClient *client, Error **errp)
{
    uint32_t flags;
    bool fixedNewstyle = false;
    bool no_zeroes = false;

    /* Client sends:
        [ 0 ..   3]   client flags

       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
        return -EIO;
    }
    /* Mode is upgraded as the client's capability flags are recognized */
    client->mode = NBD_MODE_EXPORT_NAME;
    trace_nbd_negotiate_options_flags(flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
        client->mode = NBD_MODE_SIMPLE;
    }
    if (flags & NBD_FLAG_C_NO_ZEROES) {
        no_zeroes = true;
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
    /* Any bit we did not clear above is unknown: refuse the connection */
    if (flags != 0) {
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
        return -EINVAL;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
            return -EINVAL;
        }
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
            error_setg(errp, "Bad magic received");
            return -EINVAL;
        }

        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
            return -EINVAL;
        }
        client->opt = option;

        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
            return -EINVAL;
        }
        /* Previous option must have consumed its entire payload */
        assert(!client->optlen);
        client->optlen = length;

        if (length > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
                       length, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
        /*
         * TLS is configured but the channel has not yet been upgraded
         * (ioc still equals the plain socket): only STARTTLS is allowed.
         */
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                if (length) {
                    /* Unconditionally drop the connection if the client
                     * can't start a TLS negotiation correctly */
                    return nbd_reject_length(client, true, errp);
                }
                tioc = nbd_negotiate_handle_starttls(client, errp);
                if (!tioc) {
                    return -EIO;
                }
                ret = 0;
                object_unref(OBJECT(client->ioc));
                client->ioc = tioc;
                break;

            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
                error_setg(errp, "Option 0x%x not permitted before TLS",
                           option);
                return -EINVAL;

            default:
                /* Let the client keep trying, unless they asked to
                 * quit. Always try to give an error back to the
                 * client; but when replying to OPT_ABORT, be aware
                 * that the client may hang up before receiving the
                 * error, in which case we are fine ignoring the
                 * resulting EPIPE. */
                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
                                   option == NBD_OPT_ABORT ? NULL : errp,
                                   "Option 0x%" PRIx32
                                   " not permitted before TLS", option);
                if (option == NBD_OPT_ABORT) {
                    return 1;
                }
                break;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else {
                    ret = nbd_negotiate_handle_list(client, errp);
                }
                break;

            case NBD_OPT_ABORT:
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, errp);
                if (ret == 1) {
                    /* Only GO transitions to the transmission phase */
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    /* Cannot downgrade from extended headers */
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
                        "extended headers already negotiated");
                } else if (client->mode >= NBD_MODE_STRUCTURED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_STRUCTURED;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, errp);
                break;

            case NBD_OPT_EXTENDED_HEADERS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "extended headers already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_EXTENDED;
                }
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }
        if (ret < 0) {
            return ret;
        }
    }
}
1345
/* nbd_negotiate
 * Send the fixed-newstyle greeting and run the option phase.
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    ERRP_GUARD();
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;

    /* Old style negotiation header, no room for options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   export flags (zero-extended)
        [28 .. 151]   reserved     (0)

       New style negotiation header, client can send options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
     */

    qio_channel_set_blocking(client->ioc, false, NULL);
    qio_channel_set_follow_coroutine_ctx(client->ioc, true);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    /* Only the 18-byte new-style greeting is actually transmitted */
    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
        error_prepend(errp, "write failed: ");
        return -EINVAL;
    }
    ret = nbd_negotiate_options(client, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    /* Option phase must not leave unconsumed payload behind */
    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}
1399
/* nbd_read_eof
 * Tries to read @size bytes from @ioc. This is a local implementation of
 * qio_channel_readv_all_eof. We have it here because we need it to be
 * interruptible and to know when the coroutine is yielding.
 * Returns 1 on success
 *         0 on eof, when no data was read (errp is not set)
 *         negative errno on failure (errp is set)
 */
static inline int coroutine_fn
nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
{
    bool partial = false;  /* true once at least one byte has arrived */

    assert(size);
    while (size > 0) {
        struct iovec iov = { .iov_base = buffer, .iov_len = size };
        ssize_t len;

        len = qio_channel_readv(client->ioc, &iov, 1, errp);
        if (len == QIO_CHANNEL_ERR_BLOCK) {
            /* read_yielding is published under the lock before yielding so
             * nbd_drained_poll() can observe it */
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = true;

                /* Prompt main loop thread to re-run nbd_drained_poll() */
                aio_wait_kick();
            }
            qio_channel_yield(client->ioc, G_IO_IN);
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = false;
                /* Bail out if a drained section began while we yielded */
                if (client->quiescing) {
                    return -EAGAIN;
                }
            }
            continue;
        } else if (len < 0) {
            return -EIO;
        } else if (len == 0) {
            /* EOF mid-message is an error; EOF before any byte is clean */
            if (partial) {
                error_setg(errp,
                           "Unexpected end-of-file before all bytes were read");
                return -EIO;
            } else {
                return 0;
            }
        }

        partial = true;
        size -= len;
        buffer = (uint8_t *) buffer + len;
    }
    return 1;
}
1452
/*
 * Read and decode one request header from the client.  The header size
 * depends on whether extended headers were negotiated.
 * Returns 0 on success, -EIO on EOF/short read, -EINVAL on bad magic,
 * or the negative errno from nbd_read_eof().
 */
static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request,
                                            Error **errp)
{
    uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
    uint32_t magic, expect;
    int ret;
    size_t size = client->mode >= NBD_MODE_EXTENDED ?
        NBD_EXTENDED_REQUEST_SIZE : NBD_REQUEST_SIZE;

    ret = nbd_read_eof(client, buf, size, errp);
    if (ret < 0) {
        return ret;
    }
    /* Clean EOF between requests still counts as an error here */
    if (ret == 0) {
        return -EIO;
    }

    /*
     * Compact request
     *  [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 27]   len
     * Extended request
     *  [ 0 ..  3]   magic   (NBD_EXTENDED_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, NBD_CMD_FLAG_PAYLOAD_LEN, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 31]   len
     */

    magic = ldl_be_p(buf);
    request->flags  = lduw_be_p(buf + 4);
    request->type   = lduw_be_p(buf + 6);
    request->cookie = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    if (client->mode >= NBD_MODE_EXTENDED) {
        request->len = ldq_be_p(buf + 24);
        expect = NBD_EXTENDED_REQUEST_MAGIC;
    } else {
        request->len = (uint32_t)ldl_be_p(buf + 24); /* widen 32 to 64 bits */
        expect = NBD_REQUEST_MAGIC;
    }

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != expect) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ", expected 0x%"
                   PRIx32 ")", magic, expect);
        return -EINVAL;
    }
    return 0;
}
1510
41996e38
PB
1511#define MAX_NBD_REQUESTS 16
1512
/* Runs in export AioContext and main loop thread */
void nbd_client_get(NBDClient *client)
{
    /* Atomic: references may be taken from either thread */
    qatomic_inc(&client->refcount);
}
1518
/*
 * Drop a reference; frees the client when the last reference goes away.
 * Must run in the main loop thread (teardown touches main-loop state).
 */
void nbd_client_put(NBDClient *client)
{
    assert(qemu_in_main_thread());

    if (qatomic_fetch_dec(&client->refcount) == 1) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            blk_exp_unref(&client->exp->common);
        }
        g_free(client->contexts.bitmaps);
        qemu_mutex_destroy(&client->lock);
        g_free(client);
    }
}
1544
f816310d
SH
1545/*
1546 * Tries to release the reference to @client, but only if other references
1547 * remain. This is an optimization for the common case where we want to avoid
1548 * the expense of scheduling nbd_client_put() in the main loop thread.
1549 *
1550 * Returns true upon success or false if the reference was not released because
1551 * it is the last reference.
1552 */
1553static bool nbd_client_put_nonzero(NBDClient *client)
1554{
1555 int old = qatomic_read(&client->refcount);
1556 int expected;
1557
1558 do {
1559 if (old == 1) {
1560 return false;
1561 }
1562
1563 expected = old;
1564 old = qatomic_cmpxchg(&client->refcount, expected, expected - 1);
1565 } while (old != expected);
1566
1567 return true;
1568}
1569
/*
 * Begin tearing down a client connection.  Idempotent: the closing flag is
 * tested and set under the lock so only the first caller proceeds.
 * Must run in the main loop thread.
 */
static void client_close(NBDClient *client, bool negotiated)
{
    assert(qemu_in_main_thread());

    WITH_QEMU_LOCK_GUARD(&client->lock) {
        if (client->closing) {
            return;
        }

        client->closing = true;
    }

    /* Force requests to finish.  They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);

    /* Also tell the client, so that they release their reference. */
    if (client->close_fn) {
        client->close_fn(client, negotiated);
    }
}
1593
7075d235 1594/* Runs in export AioContext with client->lock held */
315f78ab 1595static NBDRequestData *nbd_request_get(NBDClient *client)
d9a73806 1596{
315f78ab 1597 NBDRequestData *req;
72deddc5 1598
41996e38
PB
1599 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1600 client->nb_requests++;
1601
315f78ab 1602 req = g_new0(NBDRequestData, 1);
72deddc5 1603 req->client = client;
d9a73806
PB
1604 return req;
1605}
1606
/* Runs in export AioContext with client->lock held */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    /* req->data is page-aligned (qemu_vfree pairs with its allocator) */
    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;

    /* Let a pending drained section make progress once we go idle */
    if (client->quiescing && client->nb_requests == 0) {
        aio_wait_kick();
    }

    nbd_client_receive_next_request(client);
}
1625
/*
 * AioContext attach notifier: record the export's new context.  The export
 * must be fully quiesced at this point, which the per-client asserts check.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->common.ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            assert(client->nb_requests == 0);
            assert(client->recv_coroutine == NULL);
            assert(client->send_coroutine == NULL);
        }
    }
}
1645
/* AioContext detach notifier: the export has no context until re-attach */
static void blk_aio_detach(void *opaque)
{
    NBDExport *exp = opaque;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);

    exp->common.ctx = NULL;
}
1656
/*
 * Drain hook: mark every client quiescing so in-flight coroutines stop
 * picking up new requests (see nbd_read_eof / nbd_client_receive_next_request).
 */
static void nbd_drained_begin(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = true;
        }
    }
}
f148ae7d 1670
fd6afc50
SL
/* Drain hook: clear quiescing and restart request reception per client */
static void nbd_drained_end(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = false;
            nbd_client_receive_next_request(client);
        }
    }
}
1685
/* Runs in export AioContext */
static void nbd_wake_read_bh(void *opaque)
{
    NBDClient *client = opaque;
    /* Wake a coroutine blocked in qio_channel_yield(G_IO_IN) */
    qio_channel_wake_read(client->ioc);
}
1692
/*
 * Drain hook: report whether any client still has requests in flight.
 * Returning true keeps the drained section polling.
 */
static bool nbd_drained_poll(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            if (client->nb_requests != 0) {
                /*
                 * If there's a coroutine waiting for a request on nbd_read_eof()
                 * enter it here so we don't depend on the client to wake it up.
                 *
                 * Schedule a BH in the export AioContext to avoid missing the
                 * wake up due to the race between qio_channel_wake_read() and
                 * qio_channel_yield().
                 */
                if (client->recv_coroutine != NULL && client->read_yielding) {
                    aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp),
                                            nbd_wake_read_bh, client);
                }

                return true;
            }
        }
    }

    return false;
}
1723
/* Medium-eject notifier: shut the export down when its backing BB goes away */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    NBDExport *exp = container_of(n, NBDExport, eject_notifier);

    assert(qemu_in_main_thread());

    blk_exp_request_shutdown(&exp->common);
}
1732
/*
 * Arrange for the export to be shut down when @blk's medium is removed.
 * May be installed at most once per export (see the NULL assert); takes a
 * reference on @blk that is released in nbd_export_delete().
 */
void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
{
    NBDExport *nbd_exp = container_of(exp, NBDExport, common);
    assert(exp->drv == &blk_exp_nbd);
    assert(nbd_exp->eject_notifier_blk == NULL);

    blk_ref(blk);
    nbd_exp->eject_notifier_blk = blk;
    nbd_exp->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
}
1744
/* Device ops installed on the export's BlockBackend for drain integration */
static const BlockDevOps nbd_block_ops = {
    .drained_begin = nbd_drained_begin,
    .drained_end = nbd_drained_end,
    .drained_poll = nbd_drained_poll,
};
1750
5b1cb497
KW
/*
 * BlockExportDriver create hook for NBD: validate the options, resolve and
 * reserve any requested dirty bitmaps, and publish the export in the global
 * exports list.  Returns 0 on success or a negative errno with errp set.
 */
static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
                             Error **errp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    BlockExportOptionsNbd *arg = &exp_args->u.nbd;
    /* Fall back to the node name when no explicit export name was given */
    const char *name = arg->name ?: exp_args->node_name;
    BlockBackend *blk = blk_exp->blk;
    int64_t size;
    uint64_t perm, shared_perm;
    bool readonly = !exp_args->writable;
    BlockDirtyBitmapOrStrList *bitmaps;
    size_t i;
    int ret;

    GLOBAL_STATE_CODE();
    assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);

    if (!nbd_server_is_running()) {
        error_setg(errp, "NBD server not running");
        return -EINVAL;
    }

    if (strlen(name) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "export name '%s' too long", name);
        return -EINVAL;
    }

    if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "description '%s' too long", arg->description);
        return -EINVAL;
    }

    if (nbd_export_find(name)) {
        error_setg(errp, "NBD server already has export named '%s'", name);
        return -EEXIST;
    }

    size = blk_getlength(blk);
    if (size < 0) {
        error_setg_errno(errp, -size,
                         "Failed to determine the NBD export's length");
        return size;
    }

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    blk_get_perm(blk, &perm, &shared_perm);
    ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
    if (ret < 0) {
        return ret;
    }

    QTAILQ_INIT(&exp->clients);
    exp->name = g_strdup(name);
    exp->description = g_strdup(arg->description);
    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);

    /* Advertise MULTI_CONN unless the server is limited to one connection */
    if (nbd_server_max_connections() != 1) {
        exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
    }
    if (readonly) {
        exp->nbdflags |= NBD_FLAG_READ_ONLY;
    } else {
        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
                          NBD_FLAG_SEND_FAST_ZERO);
    }
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    bdrv_graph_rdlock_main_loop();

    /* First pass just counts the requested bitmaps */
    for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
        exp->nr_export_bitmaps++;
    }
    exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
    for (i = 0, bitmaps = arg->bitmaps; bitmaps;
         i++, bitmaps = bitmaps->next)
    {
        const char *bitmap;
        BlockDriverState *bs = blk_bs(blk);
        BdrvDirtyBitmap *bm = NULL;

        switch (bitmaps->value->type) {
        case QTYPE_QSTRING:
            /* Plain string: search this node and its backing/filter chain */
            bitmap = bitmaps->value->u.local;
            while (bs) {
                bm = bdrv_find_dirty_bitmap(bs, bitmap);
                if (bm != NULL) {
                    break;
                }

                bs = bdrv_filter_or_cow_bs(bs);
            }

            if (bm == NULL) {
                ret = -ENOENT;
                error_setg(errp, "Bitmap '%s' is not found",
                           bitmaps->value->u.local);
                goto fail;
            }

            if (readonly && bdrv_is_writable(bs) &&
                bdrv_dirty_bitmap_enabled(bm)) {
                ret = -EINVAL;
                error_setg(errp, "Enabled bitmap '%s' incompatible with "
                           "readonly export", bitmap);
                goto fail;
            }
            break;
        case QTYPE_QDICT:
            /* Dict form names both the node and the bitmap explicitly */
            bitmap = bitmaps->value->u.external.name;
            bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
                                           bitmap, NULL, errp);
            if (!bm) {
                ret = -ENOENT;
                goto fail;
            }
            break;
        default:
            abort();
        }

        assert(bm);

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            ret = -EINVAL;
            goto fail;
        }

        exp->export_bitmaps[i] = bm;
        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
    }

    /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
    }

    exp->allocation_depth = arg->allocation_depth;

    /*
     * We need to inhibit request queuing in the block layer to ensure we can
     * be properly quiesced when entering a drained section, as our coroutines
     * servicing pending requests might enter blk_pread().
     */
    blk_set_disable_request_queuing(blk, true);

    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    blk_set_dev_ops(blk, &nbd_block_ops, exp);

    QTAILQ_INSERT_TAIL(&exports, exp, next);

    bdrv_graph_rdunlock_main_loop();

    return 0;

fail:
    /* Nothing was marked busy yet, so only the allocations are rolled back */
    bdrv_graph_rdunlock_main_loop();
    g_free(exp->export_bitmaps);
    g_free(exp->name);
    g_free(exp->description);
    return ret;
}
1915
ee0a19ec
PB
1916NBDExport *nbd_export_find(const char *name)
1917{
1918 NBDExport *exp;
1919 QTAILQ_FOREACH(exp, &exports, next) {
1920 if (strcmp(name, exp->name) == 0) {
1921 return exp;
1922 }
1923 }
1924
1925 return NULL;
1926}
1927
/* Accessor: the AioContext stored in the export's common block export. */
AioContext *
nbd_export_aio_context(NBDExport *exp)
{
    return exp->common.ctx;
}
1933
/*
 * BlockExportDriver.request_shutdown callback: disconnect all connected
 * clients and stop advertising the export by name.
 */
static void nbd_export_request_shutdown(BlockExport *blk_exp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    NBDClient *client, *next;

    /* Hold a temporary reference so the export survives client teardown. */
    blk_exp_ref(&exp->common);
    /*
     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    if (exp->name) {
        /* Unpublish: drop the name and leave the global exports list. */
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    blk_exp_unref(&exp->common);
}
1957
/*
 * BlockExportDriver.delete callback: final teardown after the export has
 * been unpublished (name cleared) and every client has disconnected.
 */
static void nbd_export_delete(BlockExport *blk_exp)
{
    size_t i;
    NBDExport *exp = container_of(blk_exp, NBDExport, common);

    /* request_shutdown has already cleared the name and the client list. */
    assert(exp->name == NULL);
    assert(QTAILQ_EMPTY(&exp->clients));

    g_free(exp->description);
    exp->description = NULL;

    if (exp->eject_notifier_blk) {
        notifier_remove(&exp->eject_notifier);
        blk_unref(exp->eject_notifier_blk);
    }
    blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
                                    blk_aio_detach, exp);
    /* Undo the blk_set_disable_request_queuing(blk, true) from creation. */
    blk_set_disable_request_queuing(exp->common.blk, false);

    /* Release the busy marks taken on the exported dirty bitmaps. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
    }
}
1981
/* NBD implementation of the generic block export driver interface. */
const BlockExportDriver blk_exp_nbd = {
    .type = BLOCK_EXPORT_TYPE_NBD,
    .instance_size = sizeof(NBDExport),
    .create = nbd_export_create,
    .delete = nbd_export_delete,
    .request_shutdown = nbd_export_request_shutdown,
};
1989
de79bfc3
VSO
1990static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
1991 unsigned niov, Error **errp)
1992{
1993 int ret;
1994
1995 g_assert(qemu_in_coroutine());
1996 qemu_co_mutex_lock(&client->send_lock);
1997 client->send_coroutine = qemu_coroutine_self();
1998
1999 ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
2000
2001 client->send_coroutine = NULL;
2002 qemu_co_mutex_unlock(&client->send_lock);
2003
2004 return ret;
2005}
2006
caad5384 2007static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
22efd811 2008 uint64_t cookie)
caad5384
VSO
2009{
2010 stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
2011 stl_be_p(&reply->error, error);
22efd811 2012 stq_be_p(&reply->cookie, cookie);
caad5384
VSO
2013}
2014
d2223cdd 2015static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
66d4f4fe 2016 NBDRequest *request,
d2223cdd
PB
2017 uint32_t error,
2018 void *data,
b2578459 2019 uint64_t len,
d2223cdd 2020 Error **errp)
22045592 2021{
de79bfc3 2022 NBDSimpleReply reply;
14cea41d 2023 int nbd_err = system_errno_to_nbd_errno(error);
de79bfc3
VSO
2024 struct iovec iov[] = {
2025 {.iov_base = &reply, .iov_len = sizeof(reply)},
2026 {.iov_base = data, .iov_len = len}
2027 };
6fb2b972 2028
a7c8ed36 2029 assert(!len || !nbd_err);
b2578459 2030 assert(len <= NBD_MAX_BUFFER_SIZE);
ac132d05
EB
2031 assert(client->mode < NBD_MODE_STRUCTURED ||
2032 (client->mode == NBD_MODE_STRUCTURED &&
2033 request->type != NBD_CMD_READ));
22efd811 2034 trace_nbd_co_send_simple_reply(request->cookie, nbd_err,
66d4f4fe 2035 nbd_err_lookup(nbd_err), len);
22efd811 2036 set_be_simple_reply(&reply, nbd_err, request->cookie);
262db388 2037
a7c8ed36 2038 return nbd_co_send_iov(client, iov, 2, errp);
22045592
PB
2039}
2040
a7c8ed36
EB
2041/*
2042 * Prepare the header of a reply chunk for network transmission.
2043 *
2044 * On input, @iov is partially initialized: iov[0].iov_base must point
2045 * to an uninitialized NBDReply, while the remaining @niov elements
2046 * (if any) must be ready for transmission. This function then
2047 * populates iov[0] for transmission.
2048 */
2049static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
2050 size_t niov, uint16_t flags, uint16_t type,
66d4f4fe 2051 NBDRequest *request)
5c54e7fa 2052{
a7c8ed36
EB
2053 size_t i, length = 0;
2054
2055 for (i = 1; i < niov; i++) {
2056 length += iov[i].iov_len;
2057 }
2058 assert(length <= NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData));
2059
11d3355f
EB
2060 if (client->mode >= NBD_MODE_EXTENDED) {
2061 NBDExtendedReplyChunk *chunk = iov->iov_base;
2062
2063 iov[0].iov_len = sizeof(*chunk);
2064 stl_be_p(&chunk->magic, NBD_EXTENDED_REPLY_MAGIC);
2065 stw_be_p(&chunk->flags, flags);
2066 stw_be_p(&chunk->type, type);
2067 stq_be_p(&chunk->cookie, request->cookie);
2068 stq_be_p(&chunk->offset, request->from);
2069 stq_be_p(&chunk->length, length);
2070 } else {
2071 NBDStructuredReplyChunk *chunk = iov->iov_base;
2072
2073 iov[0].iov_len = sizeof(*chunk);
2074 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
2075 stw_be_p(&chunk->flags, flags);
2076 stw_be_p(&chunk->type, type);
2077 stq_be_p(&chunk->cookie, request->cookie);
2078 stl_be_p(&chunk->length, length);
2079 }
5c54e7fa
VSO
2080}
2081
a7c8ed36 2082static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
66d4f4fe 2083 NBDRequest *request,
a7c8ed36 2084 Error **errp)
ef8c887e 2085{
a7c8ed36 2086 NBDReply hdr;
ef8c887e 2087 struct iovec iov[] = {
a7c8ed36 2088 {.iov_base = &hdr},
ef8c887e
EB
2089 };
2090
22efd811 2091 trace_nbd_co_send_chunk_done(request->cookie);
a7c8ed36 2092 set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
66d4f4fe 2093 NBD_REPLY_TYPE_NONE, request);
ef8c887e
EB
2094 return nbd_co_send_iov(client, iov, 1, errp);
2095}
2096
a7c8ed36 2097static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
66d4f4fe 2098 NBDRequest *request,
a7c8ed36
EB
2099 uint64_t offset,
2100 void *data,
b2578459 2101 uint64_t size,
a7c8ed36
EB
2102 bool final,
2103 Error **errp)
5c54e7fa 2104{
a7c8ed36 2105 NBDReply hdr;
efdc0c10 2106 NBDStructuredReadData chunk;
5c54e7fa 2107 struct iovec iov[] = {
a7c8ed36 2108 {.iov_base = &hdr},
5c54e7fa
VSO
2109 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2110 {.iov_base = data, .iov_len = size}
2111 };
2112
b2578459 2113 assert(size && size <= NBD_MAX_BUFFER_SIZE);
22efd811 2114 trace_nbd_co_send_chunk_read(request->cookie, offset, data, size);
a7c8ed36 2115 set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
66d4f4fe 2116 NBD_REPLY_TYPE_OFFSET_DATA, request);
5c54e7fa
VSO
2117 stq_be_p(&chunk.offset, offset);
2118
a7c8ed36 2119 return nbd_co_send_iov(client, iov, 3, errp);
5c54e7fa 2120}
ac132d05 2121
a7c8ed36 2122static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
66d4f4fe 2123 NBDRequest *request,
a7c8ed36
EB
2124 uint32_t error,
2125 const char *msg,
2126 Error **errp)
60ace2ba 2127{
a7c8ed36 2128 NBDReply hdr;
60ace2ba
VSO
2129 NBDStructuredError chunk;
2130 int nbd_err = system_errno_to_nbd_errno(error);
2131 struct iovec iov[] = {
a7c8ed36 2132 {.iov_base = &hdr},
60ace2ba
VSO
2133 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2134 {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
2135 };
2136
2137 assert(nbd_err);
22efd811 2138 trace_nbd_co_send_chunk_error(request->cookie, nbd_err,
a7c8ed36
EB
2139 nbd_err_lookup(nbd_err), msg ? msg : "");
2140 set_be_chunk(client, iov, 3, NBD_REPLY_FLAG_DONE,
66d4f4fe 2141 NBD_REPLY_TYPE_ERROR, request);
60ace2ba 2142 stl_be_p(&chunk.error, nbd_err);
a7c8ed36 2143 stw_be_p(&chunk.message_length, iov[2].iov_len);
60ace2ba 2144
a7c8ed36 2145 return nbd_co_send_iov(client, iov, 3, errp);
60ace2ba
VSO
2146}
2147
/* Do a sparse read and send the structured reply to the client.
 * Returns -errno if sending fails. blk_co_block_status_above() failure is
 * reported to the client, at which point this function succeeds.
 */
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
                                                NBDRequest *request,
                                                uint64_t offset,
                                                uint8_t *data,
                                                uint64_t size,
                                                Error **errp)
{
    int ret = 0;
    NBDExport *exp = client->exp;
    size_t progress = 0;

    assert(size <= NBD_MAX_BUFFER_SIZE);
    while (progress < size) {
        int64_t pnum;
        int status = blk_co_block_status_above(exp->common.blk, NULL,
                                               offset + progress,
                                               size - progress, &pnum, NULL,
                                               NULL);
        bool final;

        if (status < 0) {
            /* Status query failed: report it to the client as an error
             * chunk and stop; the connection itself stays usable. */
            char *msg = g_strdup_printf("unable to check for holes: %s",
                                        strerror(-status));

            ret = nbd_co_send_chunk_error(client, request, -status, msg, errp);
            g_free(msg);
            return ret;
        }
        assert(pnum && pnum <= size - progress);
        /* Only the chunk covering the end of the request carries DONE. */
        final = progress + pnum == size;
        if (status & BDRV_BLOCK_ZERO) {
            /* Zero range: send a hole descriptor instead of reading data. */
            NBDReply hdr;
            NBDStructuredReadHole chunk;
            struct iovec iov[] = {
                {.iov_base = &hdr},
                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
            };

            trace_nbd_co_send_chunk_read_hole(request->cookie,
                                              offset + progress, pnum);
            set_be_chunk(client, iov, 2,
                         final ? NBD_REPLY_FLAG_DONE : 0,
                         NBD_REPLY_TYPE_OFFSET_HOLE, request);
            stq_be_p(&chunk.offset, offset + progress);
            stl_be_p(&chunk.length, pnum);
            ret = nbd_co_send_iov(client, iov, 2, errp);
        } else {
            /* Data range: read it and ship it as an OFFSET_DATA chunk. */
            ret = blk_co_pread(exp->common.blk, offset + progress, pnum,
                               data + progress, 0);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "reading from file failed");
                break;
            }
            ret = nbd_co_send_chunk_read(client, request, offset + progress,
                                         data + progress, pnum, final, errp);
        }

        if (ret < 0) {
            break;
        }
        progress += pnum;
    }
    return ret;
}
2216
/* Accumulator for NBD block-status extents before sending them on the wire. */
typedef struct NBDExtentArray {
    NBDExtent64 *extents;     /* array with room for nb_alloc entries */
    unsigned int nb_alloc;    /* capacity of @extents */
    unsigned int count;       /* number of populated entries */
    uint64_t total_length;    /* sum of the lengths of all populated extents */
    bool extended;            /* 64-bit extents (mode >= NBD_MODE_EXTENDED) */
    bool can_add;             /* cleared after a failed add or a conversion */
    bool converted_to_be;     /* entries already converted for transmission */
} NBDExtentArray;
2226
bcc16cc1
EB
2227static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
2228 NBDMode mode)
89cbc7e3
VSO
2229{
2230 NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
2231
bcc16cc1 2232 assert(mode >= NBD_MODE_STRUCTURED);
89cbc7e3 2233 ea->nb_alloc = nb_alloc;
bcc16cc1
EB
2234 ea->extents = g_new(NBDExtent64, nb_alloc);
2235 ea->extended = mode >= NBD_MODE_EXTENDED;
89cbc7e3
VSO
2236 ea->can_add = true;
2237
2238 return ea;
2239}
2240
/* Release an extent array and its backing storage. */
static void nbd_extent_array_free(NBDExtentArray *ea)
{
    g_free(ea->extents);
    g_free(ea);
}
G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)
89cbc7e3
VSO
2247
2248/* Further modifications of the array after conversion are abandoned */
2249static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
2250{
2251 int i;
2252
2253 assert(!ea->converted_to_be);
bcc16cc1 2254 assert(ea->extended);
89cbc7e3
VSO
2255 ea->can_add = false;
2256 ea->converted_to_be = true;
2257
2258 for (i = 0; i < ea->count; i++) {
bcc16cc1
EB
2259 ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
2260 ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
89cbc7e3
VSO
2261 }
2262}
2263
bcc16cc1
EB
2264/* Further modifications of the array after conversion are abandoned */
2265static NBDExtent32 *nbd_extent_array_convert_to_narrow(NBDExtentArray *ea)
2266{
2267 int i;
2268 NBDExtent32 *extents = g_new(NBDExtent32, ea->count);
2269
2270 assert(!ea->converted_to_be);
2271 assert(!ea->extended);
2272 ea->can_add = false;
2273 ea->converted_to_be = true;
2274
2275 for (i = 0; i < ea->count; i++) {
2276 assert((ea->extents[i].length | ea->extents[i].flags) <= UINT32_MAX);
2277 extents[i].length = cpu_to_be32(ea->extents[i].length);
2278 extents[i].flags = cpu_to_be32(ea->extents[i].flags);
2279 }
2280
2281 return extents;
2282}
2283
fb7afc79 2284/*
89cbc7e3
VSO
2285 * Add extent to NBDExtentArray. If extent can't be added (no available space),
2286 * return -1.
2287 * For safety, when returning -1 for the first time, .can_add is set to false,
314b9026
EB
2288 * and further calls to nbd_extent_array_add() will crash.
2289 * (this avoids the situation where a caller ignores failure to add one extent,
2290 * where adding another extent that would squash into the last array entry
2291 * would result in an incorrect range reported to the client)
fb7afc79 2292 */
89cbc7e3 2293static int nbd_extent_array_add(NBDExtentArray *ea,
bcc16cc1 2294 uint64_t length, uint32_t flags)
e7b1948d 2295{
89cbc7e3
VSO
2296 assert(ea->can_add);
2297
2298 if (!length) {
2299 return 0;
2300 }
bcc16cc1
EB
2301 if (!ea->extended) {
2302 assert(length <= UINT32_MAX);
2303 }
89cbc7e3
VSO
2304
2305 /* Extend previous extent if flags are the same */
2306 if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
bcc16cc1 2307 uint64_t sum = length + ea->extents[ea->count - 1].length;
89cbc7e3 2308
bcc16cc1
EB
2309 /*
2310 * sum cannot overflow: the block layer bounds image size at
2311 * 2^63, and ea->extents[].length comes from the block layer.
2312 */
2313 assert(sum >= length);
2314 if (sum <= UINT32_MAX || ea->extended) {
89cbc7e3
VSO
2315 ea->extents[ea->count - 1].length = sum;
2316 ea->total_length += length;
2317 return 0;
2318 }
2319 }
2320
2321 if (ea->count >= ea->nb_alloc) {
2322 ea->can_add = false;
2323 return -1;
2324 }
2325
2326 ea->total_length += length;
bcc16cc1 2327 ea->extents[ea->count] = (NBDExtent64) {.length = length, .flags = flags};
89cbc7e3 2328 ea->count++;
e7b1948d 2329
89cbc7e3
VSO
2330 return 0;
2331}
2332
ff7e261b 2333static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
6f58ac55
EGE
2334 uint64_t offset, uint64_t bytes,
2335 NBDExtentArray *ea)
89cbc7e3
VSO
2336{
2337 while (bytes) {
e7b1948d
VSO
2338 uint32_t flags;
2339 int64_t num;
ff7e261b
EGE
2340 int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
2341 NULL, NULL);
fb7afc79 2342
e7b1948d
VSO
2343 if (ret < 0) {
2344 return ret;
2345 }
2346
0da98568
NS
2347 flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
2348 (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
e7b1948d 2349
89cbc7e3
VSO
2350 if (nbd_extent_array_add(ea, num, flags) < 0) {
2351 return 0;
e7b1948d 2352 }
fb7afc79 2353
89cbc7e3
VSO
2354 offset += num;
2355 bytes -= num;
e7b1948d
VSO
2356 }
2357
e7b1948d
VSO
2358 return 0;
2359}
2360
ff7e261b 2361static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
6f58ac55
EGE
2362 uint64_t offset, uint64_t bytes,
2363 NBDExtentArray *ea)
71719cd5
EB
2364{
2365 while (bytes) {
2366 int64_t num;
ff7e261b
EGE
2367 int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
2368 &num);
71719cd5
EB
2369
2370 if (ret < 0) {
2371 return ret;
2372 }
2373
2374 if (nbd_extent_array_add(ea, num, ret) < 0) {
2375 return 0;
2376 }
2377
2378 offset += num;
2379 bytes -= num;
2380 }
2381
2382 return 0;
2383}
2384
89cbc7e3
VSO
2385/*
2386 * nbd_co_send_extents
3d068aff 2387 *
89cbc7e3
VSO
2388 * @ea is converted to BE by the function
2389 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
3d068aff 2390 */
d2223cdd 2391static int coroutine_fn
66d4f4fe 2392nbd_co_send_extents(NBDClient *client, NBDRequest *request, NBDExtentArray *ea,
d2223cdd 2393 bool last, uint32_t context_id, Error **errp)
e7b1948d 2394{
a7c8ed36 2395 NBDReply hdr;
bcc16cc1
EB
2396 NBDStructuredMeta meta;
2397 NBDExtendedMeta meta_ext;
2398 g_autofree NBDExtent32 *extents = NULL;
2399 uint16_t type;
2400 struct iovec iov[] = { {.iov_base = &hdr}, {0}, {0} };
e7b1948d 2401
bcc16cc1
EB
2402 if (client->mode >= NBD_MODE_EXTENDED) {
2403 type = NBD_REPLY_TYPE_BLOCK_STATUS_EXT;
2404
2405 iov[1].iov_base = &meta_ext;
2406 iov[1].iov_len = sizeof(meta_ext);
2407 stl_be_p(&meta_ext.context_id, context_id);
2408 stl_be_p(&meta_ext.count, ea->count);
2409
2410 nbd_extent_array_convert_to_be(ea);
2411 iov[2].iov_base = ea->extents;
2412 iov[2].iov_len = ea->count * sizeof(ea->extents[0]);
2413 } else {
2414 type = NBD_REPLY_TYPE_BLOCK_STATUS;
2415
2416 iov[1].iov_base = &meta;
2417 iov[1].iov_len = sizeof(meta);
2418 stl_be_p(&meta.context_id, context_id);
2419
2420 extents = nbd_extent_array_convert_to_narrow(ea);
2421 iov[2].iov_base = extents;
2422 iov[2].iov_len = ea->count * sizeof(extents[0]);
2423 }
89cbc7e3 2424
22efd811 2425 trace_nbd_co_send_extents(request->cookie, ea->count, context_id,
66d4f4fe 2426 ea->total_length, last);
bcc16cc1
EB
2427 set_be_chunk(client, iov, 3, last ? NBD_REPLY_FLAG_DONE : 0, type,
2428 request);
e7b1948d 2429
a7c8ed36 2430 return nbd_co_send_iov(client, iov, 3, errp);
e7b1948d
VSO
2431}
2432
2433/* Get block status from the exported device and send it to the client */
6f58ac55 2434static int
66d4f4fe 2435coroutine_fn nbd_co_send_block_status(NBDClient *client, NBDRequest *request,
ff7e261b 2436 BlockBackend *blk, uint64_t offset,
bcc16cc1 2437 uint64_t length, bool dont_fragment,
6f58ac55
EGE
2438 bool last, uint32_t context_id,
2439 Error **errp)
e7b1948d
VSO
2440{
2441 int ret;
416e34bd 2442 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
bcc16cc1
EB
2443 g_autoptr(NBDExtentArray) ea =
2444 nbd_extent_array_new(nb_extents, client->mode);
e7b1948d 2445
71719cd5 2446 if (context_id == NBD_META_ID_BASE_ALLOCATION) {
ff7e261b 2447 ret = blockstatus_to_extents(blk, offset, length, ea);
71719cd5 2448 } else {
ff7e261b 2449 ret = blockalloc_to_extents(blk, offset, length, ea);
71719cd5 2450 }
e7b1948d 2451 if (ret < 0) {
66d4f4fe 2452 return nbd_co_send_chunk_error(client, request, -ret,
a7c8ed36 2453 "can't get block status", errp);
e7b1948d
VSO
2454 }
2455
66d4f4fe 2456 return nbd_co_send_extents(client, request, ea, last, context_id, errp);
3d068aff
VSO
2457}
2458
dacbb6eb 2459/* Populate @ea from a dirty bitmap. */
89cbc7e3
VSO
2460static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
2461 uint64_t offset, uint64_t length,
dacbb6eb 2462 NBDExtentArray *es)
3d068aff 2463{
dacbb6eb
VSO
2464 int64_t start, dirty_start, dirty_count;
2465 int64_t end = offset + length;
2466 bool full = false;
bcc16cc1 2467 int64_t bound = es->extended ? INT64_MAX : INT32_MAX;
3d068aff
VSO
2468
2469 bdrv_dirty_bitmap_lock(bitmap);
2470
dacbb6eb 2471 for (start = offset;
bcc16cc1 2472 bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, bound,
dacbb6eb
VSO
2473 &dirty_start, &dirty_count);
2474 start = dirty_start + dirty_count)
2475 {
2476 if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
2477 (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
2478 {
2479 full = true;
89cbc7e3
VSO
2480 break;
2481 }
3d068aff
VSO
2482 }
2483
dacbb6eb 2484 if (!full) {
c0b21f2e
EB
2485 /* last non dirty extent, nothing to do if array is now full */
2486 (void) nbd_extent_array_add(es, end - start, 0);
dacbb6eb 2487 }
3d068aff
VSO
2488
2489 bdrv_dirty_bitmap_unlock(bitmap);
3d068aff
VSO
2490}
2491
66d4f4fe
EB
2492static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
2493 NBDRequest *request,
2494 BdrvDirtyBitmap *bitmap,
2495 uint64_t offset,
bcc16cc1 2496 uint64_t length, bool dont_fragment,
66d4f4fe
EB
2497 bool last, uint32_t context_id,
2498 Error **errp)
3d068aff 2499{
416e34bd 2500 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
bcc16cc1
EB
2501 g_autoptr(NBDExtentArray) ea =
2502 nbd_extent_array_new(nb_extents, client->mode);
3d068aff 2503
dacbb6eb 2504 bitmap_to_extents(bitmap, offset, length, ea);
3d068aff 2505
66d4f4fe 2506 return nbd_co_send_extents(client, request, ea, last, context_id, errp);
e7b1948d
VSO
2507}
2508
/*
 * nbd_co_block_status_payload_read
 * Called when a client wants a subset of negotiated contexts via a
 * BLOCK_STATUS payload. Check the payload for valid length and
 * contents. On success, return 0 with request updated to effective
 * length. If request was invalid but all payload consumed, return 0
 * with request->len and request->contexts->count set to 0 (which will
 * trigger an appropriate NBD_EINVAL response later on). Return
 * negative errno if the payload was not fully consumed.
 */
static int
nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
                                 Error **errp)
{
    uint64_t payload_len = request->len;
    g_autofree char *buf = NULL;
    size_t count, i, nr_bitmaps;
    uint32_t id;

    if (payload_len > NBD_MAX_BUFFER_SIZE) {
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }

    assert(client->contexts.exp == client->exp);
    nr_bitmaps = client->exp->nr_export_bitmaps;
    request->contexts = g_new0(NBDMetaContexts, 1);
    request->contexts->exp = client->exp;

    /* Payload must be effect length plus a whole number of context ids,
     * no more than were negotiated. */
    if (payload_len % sizeof(uint32_t) ||
        payload_len < sizeof(NBDBlockStatusPayload) ||
        payload_len > (sizeof(NBDBlockStatusPayload) +
                       sizeof(id) * client->contexts.count)) {
        goto skip;
    }

    buf = g_malloc(payload_len);
    if (nbd_read(client->ioc, buf, payload_len,
                 "CMD_BLOCK_STATUS data", errp) < 0) {
        return -EIO;
    }
    trace_nbd_co_receive_request_payload_received(request->cookie,
                                                  payload_len);
    request->contexts->bitmaps = g_new0(bool, nr_bitmaps);
    count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
    /* From here on the payload has been consumed; jumping to skip must
     * not drop any more bytes. */
    payload_len = 0;

    /* Each id must name a negotiated context and appear at most once. */
    for (i = 0; i < count; i++) {
        id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
        if (id == NBD_META_ID_BASE_ALLOCATION) {
            if (!client->contexts.base_allocation ||
                request->contexts->base_allocation) {
                goto skip;
            }
            request->contexts->base_allocation = true;
        } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
            if (!client->contexts.allocation_depth ||
                request->contexts->allocation_depth) {
                goto skip;
            }
            request->contexts->allocation_depth = true;
        } else {
            unsigned idx = id - NBD_META_ID_DIRTY_BITMAP;

            if (idx >= nr_bitmaps || !client->contexts.bitmaps[idx] ||
                request->contexts->bitmaps[idx]) {
                goto skip;
            }
            request->contexts->bitmaps[idx] = true;
        }
    }

    /* Effective length comes from the start of the payload. */
    request->len = ldq_be_p(buf);
    request->contexts->count = count;
    return 0;

 skip:
    trace_nbd_co_receive_block_status_payload_compliance(request->from,
                                                         request->len);
    request->len = request->contexts->count = 0;
    return nbd_drop(client->ioc, payload_len, errp);
}
2592
/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, -EAGAIN to indicate we were interrupted and the
 * channel should be quiesced, and any other negative value to report an error
 * to the client (although the caller may still need to disconnect after
 * reporting the error).
 */
static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDClient *client = req->client;
    bool extended_with_payload;
    bool check_length = false;
    bool check_rofs = false;
    bool allocate_buffer = false;
    bool payload_okay = false;
    uint64_t payload_len = 0;
    int valid_flags = NBD_CMD_FLAG_FUA;
    int ret;

    g_assert(qemu_in_coroutine());
    ret = nbd_receive_request(client, request, errp);
    if (ret < 0) {
        return ret;
    }

    trace_nbd_co_receive_request_decode_type(request->cookie, request->type,
                                             nbd_cmd_lookup(request->type));
    /* Extended-header clients may flag that request->len is a payload size. */
    extended_with_payload = client->mode >= NBD_MODE_EXTENDED &&
        request->flags & NBD_CMD_FLAG_PAYLOAD_LEN;
    if (extended_with_payload) {
        payload_len = request->len;
        check_length = true;
    }

    /* Per-command validation requirements, applied after the switch. */
    switch (request->type) {
    case NBD_CMD_DISC:
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        req->complete = true;
        return -EIO;

    case NBD_CMD_READ:
        if (client->mode >= NBD_MODE_STRUCTURED) {
            valid_flags |= NBD_CMD_FLAG_DF;
        }
        check_length = true;
        allocate_buffer = true;
        break;

    case NBD_CMD_WRITE:
        if (client->mode >= NBD_MODE_EXTENDED) {
            if (!extended_with_payload) {
                /* The client is noncompliant. Trace it, but proceed. */
                trace_nbd_co_receive_ext_payload_compliance(request->from,
                                                            request->len);
            }
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        }
        payload_okay = true;
        payload_len = request->len;
        check_length = true;
        allocate_buffer = true;
        check_rofs = true;
        break;

    case NBD_CMD_FLUSH:
        break;

    case NBD_CMD_TRIM:
        check_rofs = true;
        break;

    case NBD_CMD_CACHE:
        check_length = true;
        break;

    case NBD_CMD_WRITE_ZEROES:
        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
        check_rofs = true;
        break;

    case NBD_CMD_BLOCK_STATUS:
        if (extended_with_payload) {
            ret = nbd_co_block_status_payload_read(client, request, errp);
            if (ret < 0) {
                return ret;
            }
            /* payload now consumed */
            check_length = false;
            payload_len = 0;
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        } else {
            request->contexts = &client->contexts;
        }
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
        break;

    default:
        /* Unrecognized, will fail later */
        ;
    }

    /* Payload and buffer handling. */
    if (!payload_len) {
        req->complete = true;
    }
    if (check_length && request->len > NBD_MAX_BUFFER_SIZE) {
        /* READ, WRITE, CACHE */
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }
    if (payload_len && !payload_okay) {
        /*
         * For now, we don't support payloads on other commands; but
         * we can keep the connection alive by ignoring the payload.
         * We will fail the command later with NBD_EINVAL for the use
         * of an unsupported flag (and not for access beyond bounds).
         */
        assert(request->type != NBD_CMD_WRITE);
        request->len = 0;
    }
    if (allocate_buffer) {
        /* READ, WRITE */
        req->data = blk_try_blockalign(client->exp->common.blk,
                                       request->len);
        if (req->data == NULL) {
            error_setg(errp, "No memory");
            return -ENOMEM;
        }
    }
    if (payload_len) {
        if (payload_okay) {
            /* WRITE */
            assert(req->data);
            ret = nbd_read(client->ioc, req->data, payload_len,
                           "CMD_WRITE data", errp);
        } else {
            /* Unsupported payload: drain it to keep the stream in sync. */
            ret = nbd_drop(client->ioc, payload_len, errp);
        }
        if (ret < 0) {
            return -EIO;
        }
        req->complete = true;
        trace_nbd_co_receive_request_payload_received(request->cookie,
                                                      payload_len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY && check_rofs) {
        /* WRITE, TRIM, WRITE_ZEROES */
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu64
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}
2776
6a417599
VSO
2777/* Send simple reply without a payload, or a structured error
2778 * @error_msg is ignored if @ret >= 0
2779 * Returns 0 if connection is still live, -errno on failure to talk to client
2780 */
2781static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
66d4f4fe 2782 NBDRequest *request,
6a417599
VSO
2783 int ret,
2784 const char *error_msg,
2785 Error **errp)
2786{
ac132d05 2787 if (client->mode >= NBD_MODE_STRUCTURED && ret < 0) {
66d4f4fe 2788 return nbd_co_send_chunk_error(client, request, -ret, error_msg, errp);
11d3355f
EB
2789 } else if (client->mode >= NBD_MODE_EXTENDED) {
2790 return nbd_co_send_chunk_done(client, request, errp);
6a417599 2791 } else {
66d4f4fe 2792 return nbd_co_send_simple_reply(client, request, ret < 0 ? -ret : 0,
6a417599
VSO
2793 NULL, 0, errp);
2794 }
2795}
2796
2797/* Handle NBD_CMD_READ request.
2798 * Return -errno if sending fails. Other errors are reported directly to the
2799 * client as an error reply. */
2800static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
2801 uint8_t *data, Error **errp)
2802{
2803 int ret;
2804 NBDExport *exp = client->exp;
2805
7fa5c565 2806 assert(request->type == NBD_CMD_READ);
b2578459 2807 assert(request->len <= NBD_MAX_BUFFER_SIZE);
6a417599
VSO
2808
2809 /* XXX: NBD Protocol only documents use of FUA with WRITE */
2810 if (request->flags & NBD_CMD_FLAG_FUA) {
37a4f70c 2811 ret = blk_co_flush(exp->common.blk);
6a417599 2812 if (ret < 0) {
66d4f4fe 2813 return nbd_send_generic_reply(client, request, ret,
6a417599
VSO
2814 "flush failed", errp);
2815 }
2816 }
2817
ac132d05
EB
2818 if (client->mode >= NBD_MODE_STRUCTURED &&
2819 !(request->flags & NBD_CMD_FLAG_DF) && request->len)
2f454def 2820 {
66d4f4fe 2821 return nbd_co_send_sparse_read(client, request, request->from,
6a417599
VSO
2822 data, request->len, errp);
2823 }
2824
d2223cdd 2825 ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0);
7fa5c565 2826 if (ret < 0) {
66d4f4fe 2827 return nbd_send_generic_reply(client, request, ret,
6a417599
VSO
2828 "reading from file failed", errp);
2829 }
2830
ac132d05 2831 if (client->mode >= NBD_MODE_STRUCTURED) {
6a417599 2832 if (request->len) {
66d4f4fe 2833 return nbd_co_send_chunk_read(client, request, request->from, data,
a7c8ed36 2834 request->len, true, errp);
6a417599 2835 } else {
66d4f4fe 2836 return nbd_co_send_chunk_done(client, request, errp);
6a417599
VSO
2837 }
2838 } else {
66d4f4fe 2839 return nbd_co_send_simple_reply(client, request, 0,
6a417599
VSO
2840 data, request->len, errp);
2841 }
2842}
2843
7fa5c565
VSO
2844/*
2845 * nbd_do_cmd_cache
2846 *
2847 * Handle NBD_CMD_CACHE request.
2848 * Return -errno if sending fails. Other errors are reported directly to the
2849 * client as an error reply.
2850 */
2851static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2852 Error **errp)
2853{
2854 int ret;
2855 NBDExport *exp = client->exp;
2856
2857 assert(request->type == NBD_CMD_CACHE);
b2578459 2858 assert(request->len <= NBD_MAX_BUFFER_SIZE);
7fa5c565 2859
37a4f70c 2860 ret = blk_co_preadv(exp->common.blk, request->from, request->len,
7fa5c565
VSO
2861 NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2862
66d4f4fe 2863 return nbd_send_generic_reply(client, request, ret,
7fa5c565
VSO
2864 "caching data failed", errp);
2865}
2866
6f302e60
VSO
/*
 * Handle one fully-received NBD request by dispatching on request->type.
 *
 * @data is the payload buffer: the source for NBD_CMD_WRITE (already filled
 * by the receive path) and the destination for NBD_CMD_READ.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
 */
static coroutine_fn int nbd_handle_request(NBDClient *client,
                                           NBDRequest *request,
                                           uint8_t *data, Error **errp)
{
    int ret;
    int flags;
    NBDExport *exp = client->exp;
    char *msg;
    size_t i;

    switch (request->type) {
    case NBD_CMD_CACHE:
        return nbd_do_cmd_cache(client, request, errp);

    case NBD_CMD_READ:
        return nbd_do_cmd_read(client, request, data, errp);

    case NBD_CMD_WRITE:
        flags = 0;
        /* Client's FUA flag maps directly onto the block layer's. */
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        assert(request->len <= NBD_MAX_BUFFER_SIZE);
        ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data,
                            flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_WRITE_ZEROES:
        /* Translate NBD zeroing flags into block-layer request flags. */
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
            flags |= BDRV_REQ_NO_FALLBACK;
        }
        ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len,
                                   flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_DISC:
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

    case NBD_CMD_FLUSH:
        ret = blk_co_flush(exp->common.blk);
        return nbd_send_generic_reply(client, request, ret,
                                      "flush failed", errp);

    case NBD_CMD_TRIM:
        ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
        /* FUA on TRIM: flush afterwards, but only if the discard succeeded. */
        if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->common.blk);
        }
        return nbd_send_generic_reply(client, request, ret,
                                      "discard failed", errp);

    case NBD_CMD_BLOCK_STATUS:
        assert(request->contexts);
        assert(client->mode >= NBD_MODE_EXTENDED ||
               request->len <= UINT32_MAX);
        if (request->contexts->count) {
            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
            /*
             * Countdown over all negotiated contexts; the predecrement in
             * '!--contexts_remaining' below makes exactly the last context
             * emitted mark the end of the reply.
             */
            int contexts_remaining = request->contexts->count;

            if (!request->len) {
                return nbd_send_generic_reply(client, request, -EINVAL,
                                              "need non-zero length", errp);
            }
            if (request->contexts->base_allocation) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from,
                                               request->len, dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_BASE_ALLOCATION,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            if (request->contexts->allocation_depth) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from, request->len,
                                               dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_ALLOCATION_DEPTH,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            assert(request->contexts->exp == client->exp);
            /* One reply per negotiated dirty bitmap the client selected. */
            for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
                if (!request->contexts->bitmaps[i]) {
                    continue;
                }
                ret = nbd_co_send_bitmap(client, request,
                                         client->exp->export_bitmaps[i],
                                         request->from, request->len,
                                         dont_fragment, !--contexts_remaining,
                                         NBD_META_ID_DIRTY_BITMAP + i, errp);
                if (ret < 0) {
                    return ret;
                }
            }

            assert(!contexts_remaining);

            return 0;
        } else if (client->contexts.count) {
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS payload not valid",
                                          errp);
        } else {
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS not negotiated",
                                          errp);
        }

    default:
        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
                              request->type);
        ret = nbd_send_generic_reply(client, request, -EINVAL, msg,
                                     errp);
        g_free(msg);
        return ret;
    }
}
3006
ff82911c
PB
/*
 * Coroutine servicing one request round-trip: receive a request, handle
 * it, send the reply, and (re)arm reception of the next request.
 *
 * Owns a reference to the NBDClient passed as opaque; it is dropped on
 * every exit path below.
 */
static coroutine_fn void nbd_trip(void *opaque)
{
    NBDClient *client = opaque;
    NBDRequestData *req = NULL;
    NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
    int ret;
    Error *local_err = NULL;

    /*
     * Note that nbd_client_put() and client_close() must be called from the
     * main loop thread. Use aio_co_reschedule_self() to switch AioContext
     * before calling these functions.
     */

    trace_nbd_trip();

    qemu_mutex_lock(&client->lock);

    if (client->closing) {
        goto done;
    }

    if (client->quiescing) {
        /*
         * We're switching between AIO contexts. Don't attempt to receive a new
         * request and kick the main context which may be waiting for us.
         */
        client->recv_coroutine = NULL;
        aio_wait_kick();
        goto done;
    }

    req = nbd_request_get(client);

    /*
     * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has
     * set client->quiescing but by the time we get back nbd_drained_end() may
     * have already cleared client->quiescing. In that case we try again
     * because nothing else will spawn an nbd_trip() coroutine until we set
     * client->recv_coroutine = NULL further down.
     */
    do {
        assert(client->recv_coroutine == qemu_coroutine_self());
        /* Drop the lock across the (blocking) receive. */
        qemu_mutex_unlock(&client->lock);
        ret = nbd_co_receive_request(req, &request, &local_err);
        qemu_mutex_lock(&client->lock);
    } while (ret == -EAGAIN && !client->quiescing);

    client->recv_coroutine = NULL;

    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

    if (ret == -EAGAIN) {
        goto done;
    }

    /* Allow the next request to be received while we reply to this one. */
    nbd_client_receive_next_request(client);

    /* -EIO is fatal per nbd_co_receive_request() semantics: no reply. */
    if (ret == -EIO) {
        goto disconnect;
    }

    qemu_mutex_unlock(&client->lock);
    /* Cork output so the reply is batched until fully composed. */
    qio_channel_set_cork(client->ioc, true);

    if (ret < 0) {
        /* It wasn't -EIO, so, according to nbd_co_receive_request()
         * semantics, we should return the error to the client. */
        Error *export_err = local_err;

        local_err = NULL;
        ret = nbd_send_generic_reply(client, &request, -EINVAL,
                                     error_get_pretty(export_err), &local_err);
        error_free(export_err);
    } else {
        ret = nbd_handle_request(client, &request, req->data, &local_err);
    }
    /* Free a per-request context list; the client-wide one stays. */
    if (request.contexts && request.contexts != &client->contexts) {
        assert(request.type == NBD_CMD_BLOCK_STATUS);
        g_free(request.contexts->bitmaps);
        g_free(request.contexts);
    }

    qio_channel_set_cork(client->ioc, false);
    qemu_mutex_lock(&client->lock);

    if (ret < 0) {
        error_prepend(&local_err, "Failed to send reply: ");
        goto disconnect;
    }

    /*
     * We must disconnect after NBD_CMD_WRITE or BLOCK_STATUS with
     * payload if we did not read the payload.
     */
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
        goto disconnect;
    }

done:
    /* Normal exit: release the request slot and our client reference. */
    if (req) {
        nbd_request_put(req);
    }

    qemu_mutex_unlock(&client->lock);

    if (!nbd_client_put_nonzero(client)) {
        /* Last reference: drop it from the main loop thread (see above). */
        aio_co_reschedule_self(qemu_get_aio_context());
        nbd_client_put(client);
    }
    return;

disconnect:
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }

    nbd_request_put(req);
    qemu_mutex_unlock(&client->lock);

    /* client_close()/nbd_client_put() require the main loop thread. */
    aio_co_reschedule_self(qemu_get_aio_context());
    client_close(client, true);
    nbd_client_put(client);
}
af49bbbe 3139
7075d235
SH
3140/*
3141 * Runs in export AioContext and main loop thread. Caller must hold
3142 * client->lock.
3143 */
ff82911c 3144static void nbd_client_receive_next_request(NBDClient *client)
958c717d 3145{
f148ae7d
SL
3146 if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
3147 !client->quiescing) {
ff82911c
PB
3148 nbd_client_get(client);
3149 client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
8612c686 3150 aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
958c717d
HR
3151 }
3152}
3153
1a6245a5
FZ
3154static coroutine_fn void nbd_co_client_start(void *opaque)
3155{
c84087f2 3156 NBDClient *client = opaque;
2fd2c840 3157 Error *local_err = NULL;
1a6245a5 3158
df8ad9f1
EB
3159 qemu_co_mutex_init(&client->send_lock);
3160
2fd2c840
VSO
3161 if (nbd_negotiate(client, &local_err)) {
3162 if (local_err) {
3163 error_report_err(local_err);
3164 }
0c9390d9 3165 client_close(client, false);
c84087f2 3166 return;
1a6245a5 3167 }
ff82911c 3168
7075d235
SH
3169 WITH_QEMU_LOCK_GUARD(&client->lock) {
3170 nbd_client_receive_next_request(client);
3171 }
1a6245a5
FZ
3172}
3173
0c9390d9 3174/*
7f7dfe2a
VSO
3175 * Create a new client listener using the given channel @sioc.
3176 * Begin servicing it in a coroutine. When the connection closes, call
3177 * @close_fn with an indication of whether the client completed negotiation.
0c9390d9 3178 */
7f7dfe2a 3179void nbd_client_new(QIOChannelSocket *sioc,
f95910fe 3180 QCryptoTLSCreds *tlscreds,
b25e12da 3181 const char *tlsauthz,
0c9390d9 3182 void (*close_fn)(NBDClient *, bool))
af49bbbe 3183{
1743b515 3184 NBDClient *client;
c84087f2 3185 Coroutine *co;
1a6245a5 3186
e8d3eb74 3187 client = g_new0(NBDClient, 1);
7075d235 3188 qemu_mutex_init(&client->lock);
1743b515 3189 client->refcount = 1;
f95910fe
DB
3190 client->tlscreds = tlscreds;
3191 if (tlscreds) {
3192 object_ref(OBJECT(client->tlscreds));
3193 }
b25e12da 3194 client->tlsauthz = g_strdup(tlsauthz);
1c778ef7 3195 client->sioc = sioc;
f1426881 3196 qio_channel_set_delay(QIO_CHANNEL(sioc), false);
1c778ef7
DB
3197 object_ref(OBJECT(client->sioc));
3198 client->ioc = QIO_CHANNEL(sioc);
3199 object_ref(OBJECT(client->ioc));
0c9390d9 3200 client->close_fn = close_fn;
2c8d9f06 3201
c84087f2
VSO
3202 co = qemu_coroutine_create(nbd_co_client_start, client);
3203 qemu_coroutine_enter(co);
af49bbbe 3204}