]> git.proxmox.com Git - mirror_qemu.git/blame - nbd.c
block: resize backing image during active layer commit, if needed
[mirror_qemu.git] / nbd.c
CommitLineData
75818250 1/*
7a5ca864
FB
2 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
3 *
4 * Network Block Device
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 of the License.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
8167ee88 16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
75818250 17 */
7a5ca864 18
737e150e
PB
19#include "block/nbd.h"
20#include "block/block.h"
7a5ca864 21
737e150e 22#include "block/coroutine.h"
262db388 23
7a5ca864
FB
24#include <errno.h>
25#include <string.h>
03ff3ca3 26#ifndef _WIN32
7a5ca864 27#include <sys/ioctl.h>
03ff3ca3 28#endif
5dc2eec9 29#if defined(__sun__) || defined(__HAIKU__)
7e00eb9b
AL
30#include <sys/ioccom.h>
31#endif
7a5ca864
FB
32#include <ctype.h>
33#include <inttypes.h>
75818250 34
b90fb4b8
PB
35#ifdef __linux__
36#include <linux/fs.h>
37#endif
38
1de7afc9
PB
39#include "qemu/sockets.h"
40#include "qemu/queue.h"
6a1751b7 41#include "qemu/main-loop.h"
03ff3ca3
AL
42
43//#define DEBUG_NBD
44
45#ifdef DEBUG_NBD
75818250 46#define TRACE(msg, ...) do { \
03ff3ca3 47 LOG(msg, ## __VA_ARGS__); \
75818250 48} while(0)
03ff3ca3
AL
49#else
50#define TRACE(msg, ...) \
51 do { } while (0)
52#endif
7a5ca864
FB
53
54#define LOG(msg, ...) do { \
55 fprintf(stderr, "%s:%s():L%d: " msg "\n", \
56 __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
57} while(0)
58
7a5ca864
FB
59/* This is all part of the "official" NBD API */
60
fa26c26b 61#define NBD_REQUEST_SIZE (4 + 4 + 8 + 8 + 4)
b2e3d87f 62#define NBD_REPLY_SIZE (4 + 4 + 8)
7a5ca864
FB
63#define NBD_REQUEST_MAGIC 0x25609513
64#define NBD_REPLY_MAGIC 0x67446698
fa26c26b
PB
65#define NBD_OPTS_MAGIC 0x49484156454F5054LL
66#define NBD_CLIENT_MAGIC 0x0000420281861253LL
7a5ca864
FB
67
68#define NBD_SET_SOCK _IO(0xab, 0)
69#define NBD_SET_BLKSIZE _IO(0xab, 1)
70#define NBD_SET_SIZE _IO(0xab, 2)
71#define NBD_DO_IT _IO(0xab, 3)
72#define NBD_CLEAR_SOCK _IO(0xab, 4)
73#define NBD_CLEAR_QUE _IO(0xab, 5)
b2e3d87f
NT
74#define NBD_PRINT_DEBUG _IO(0xab, 6)
75#define NBD_SET_SIZE_BLOCKS _IO(0xab, 7)
7a5ca864 76#define NBD_DISCONNECT _IO(0xab, 8)
bbb74edd
PB
77#define NBD_SET_TIMEOUT _IO(0xab, 9)
78#define NBD_SET_FLAGS _IO(0xab, 10)
7a5ca864 79
b2e3d87f 80#define NBD_OPT_EXPORT_NAME (1 << 0)
1d45f8b5 81
9a304d29
PB
82/* Definitions for opaque data types */
83
84typedef struct NBDRequest NBDRequest;
85
86struct NBDRequest {
87 QSIMPLEQ_ENTRY(NBDRequest) entry;
88 NBDClient *client;
89 uint8_t *data;
90};
91
92struct NBDExport {
2c8d9f06 93 int refcount;
0ddf08db
PB
94 void (*close)(NBDExport *exp);
95
9a304d29 96 BlockDriverState *bs;
ee0a19ec 97 char *name;
9a304d29
PB
98 off_t dev_offset;
99 off_t size;
100 uint32_t nbdflags;
4b9441f6 101 QTAILQ_HEAD(, NBDClient) clients;
ee0a19ec 102 QTAILQ_ENTRY(NBDExport) next;
9a304d29
PB
103};
104
ee0a19ec
PB
105static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
106
9a304d29
PB
107struct NBDClient {
108 int refcount;
109 void (*close)(NBDClient *client);
110
111 NBDExport *exp;
112 int sock;
113
114 Coroutine *recv_coroutine;
115
116 CoMutex send_lock;
117 Coroutine *send_coroutine;
118
4b9441f6 119 QTAILQ_ENTRY(NBDClient) next;
9a304d29 120 int nb_requests;
ff2b68aa 121 bool closing;
9a304d29
PB
122};
123
7a5ca864
FB
124/* That's all folks */
125
/*
 * Read from (do_read == true) or write to (do_read == false) the socket
 * @fd until @size bytes have been transferred, EOF is seen, or an
 * unrecoverable error occurs.
 *
 * When called from a coroutine the whole transfer is delegated to
 * qemu_co_recv()/qemu_co_send() and their result is returned as-is.
 *
 * Returns the number of bytes transferred (less than @size on EOF), or
 * the negated socket error code on failure.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    char *pos = buffer;
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;
        int err;

        if (do_read) {
            n = qemu_recv(fd, pos + done, size - done, 0);
        } else {
            n = send(fd, pos + done, size - done, 0);
        }

        if (n > 0) {
            done += n;
            continue;
        }

        /* eof */
        if (n == 0) {
            break;
        }

        err = socket_error();

        /* EINTR is always retried; EAGAIN only once some data has moved */
        if (err == EINTR || (done > 0 && err == EAGAIN)) {
            continue;
        }

        /* unrecoverable error */
        return -err;
    }

    return done;
}
170
7fe7b68b
PB
/*
 * Read @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * During negotiation the socket is in blocking mode.  Afterwards, a
 * socket that is not readable only means another thread took our
 * request/reply; recv_coroutine provides the synchronization that keeps
 * this coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool reading = true;

    return nbd_wr_sync(fd, buffer, size, reading);
}
180
/*
 * Write @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for
 * as long as the transfer reports -EAGAIN.
 *
 * Returns what nbd_wr_sync() returns: the number of bytes written or a
 * negative error code.  The result is kept in an ssize_t so that it is
 * not narrowed on its way to the caller.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    ssize_t ret;

    do {
        /* For writes, we do expect the socket to be writable. */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);

    return ret;
}
190
c12504ce
NT
/*
 * Format @address and @port into @buf (at most @len bytes, as bounded by
 * snprintf) as "address:port".  An address containing a colon is taken
 * to be an IPv6 literal and is wrapped in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    if (strchr(address, ':') == NULL) {
        snprintf(buf, len, "%s:%u", address, port);
        return;
    }

    /* IPv6 literal: needs [] around the address part */
    snprintf(buf, len, "[%s]:%u", address, port);
}
201
f17c90be
KW
202int tcp_socket_outgoing_opts(QemuOpts *opts)
203{
204 Error *local_err = NULL;
205 int fd = inet_connect_opts(opts, &local_err, NULL, NULL);
206 if (local_err != NULL) {
207 qerror_report_err(local_err);
208 error_free(local_err);
209 }
210
211 return fd;
212}
213
/*
 * Build an "address:port" specification from @address and @port and hand
 * it to tcp_socket_incoming_spec(), whose result is returned.
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);
    return tcp_socket_incoming_spec(spec);
}
cd831bd7 220
c12504ce
NT
221int tcp_socket_incoming_spec(const char *address_and_port)
222{
f8430e76
PB
223 Error *local_err = NULL;
224 int fd = inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, &local_err);
225
226 if (local_err != NULL) {
227 qerror_report_err(local_err);
228 error_free(local_err);
229 }
230 return fd;
03ff3ca3 231}
c12504ce 232
03ff3ca3
AL
233int unix_socket_incoming(const char *path)
234{
f8430e76
PB
235 Error *local_err = NULL;
236 int fd = unix_listen(path, NULL, 0, &local_err);
c12504ce 237
f8430e76
PB
238 if (local_err != NULL) {
239 qerror_report_err(local_err);
240 error_free(local_err);
241 }
242 return fd;
cd831bd7
TS
243}
244
03ff3ca3
AL
245int unix_socket_outgoing(const char *path)
246{
f8430e76
PB
247 Error *local_err = NULL;
248 int fd = unix_connect(path, &local_err);
249
250 if (local_err != NULL) {
251 qerror_report_err(local_err);
252 error_free(local_err);
253 }
254 return fd;
03ff3ca3 255}
cd831bd7 256
6b8c01e7 257/* Basic flow for negotiation
7a5ca864
FB
258
259 Server Client
7a5ca864 260 Negotiate
6b8c01e7
PB
261
262 or
263
264 Server Client
265 Negotiate #1
266 Option
267 Negotiate #2
268
269 ----
270
271 followed by
272
273 Server Client
7a5ca864
FB
274 Request
275 Response
276 Request
277 Response
278 ...
279 ...
280 Request (type == 2)
6b8c01e7 281
7a5ca864
FB
282*/
283
6b8c01e7
PB
284static int nbd_receive_options(NBDClient *client)
285{
286 int csock = client->sock;
287 char name[256];
288 uint32_t tmp, length;
289 uint64_t magic;
290 int rc;
291
292 /* Client sends:
293 [ 0 .. 3] reserved (0)
294 [ 4 .. 11] NBD_OPTS_MAGIC
295 [12 .. 15] NBD_OPT_EXPORT_NAME
296 [16 .. 19] length
297 [20 .. xx] export name (length bytes)
298 */
299
300 rc = -EINVAL;
301 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
302 LOG("read failed");
303 goto fail;
304 }
305 TRACE("Checking reserved");
306 if (tmp != 0) {
307 LOG("Bad reserved received");
308 goto fail;
309 }
310
311 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
312 LOG("read failed");
313 goto fail;
314 }
315 TRACE("Checking reserved");
316 if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
317 LOG("Bad magic received");
318 goto fail;
319 }
320
321 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
322 LOG("read failed");
323 goto fail;
324 }
325 TRACE("Checking option");
326 if (tmp != be32_to_cpu(NBD_OPT_EXPORT_NAME)) {
327 LOG("Bad option received");
328 goto fail;
329 }
330
331 if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
332 LOG("read failed");
333 goto fail;
334 }
335 TRACE("Checking length");
336 length = be32_to_cpu(length);
337 if (length > 255) {
338 LOG("Bad length received");
339 goto fail;
340 }
341 if (read_sync(csock, name, length) != length) {
342 LOG("read failed");
343 goto fail;
344 }
345 name[length] = '\0';
346
347 client->exp = nbd_export_find(name);
348 if (!client->exp) {
349 LOG("export not found");
350 goto fail;
351 }
352
353 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
354 nbd_export_get(client->exp);
355
356 TRACE("Option negotiation succeeded.");
357 rc = 0;
358fail:
359 return rc;
360}
361
9a304d29 362static int nbd_send_negotiate(NBDClient *client)
7a5ca864 363{
9a304d29 364 int csock = client->sock;
b2e3d87f 365 char buf[8 + 8 + 8 + 128];
185b4338 366 int rc;
6b8c01e7
PB
367 const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
368 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
b2e3d87f 369
6b8c01e7
PB
370 /* Negotiation header without options:
371 [ 0 .. 7] passwd ("NBDMAGIC")
372 [ 8 .. 15] magic (NBD_CLIENT_MAGIC)
b2e3d87f 373 [16 .. 23] size
6b8c01e7
PB
374 [24 .. 25] server flags (0)
375 [24 .. 27] export flags
376 [28 .. 151] reserved (0)
377
378 Negotiation header with options, part 1:
379 [ 0 .. 7] passwd ("NBDMAGIC")
380 [ 8 .. 15] magic (NBD_OPTS_MAGIC)
381 [16 .. 17] server flags (0)
382
383 part 2 (after options are sent):
384 [18 .. 25] size
385 [26 .. 27] export flags
386 [28 .. 151] reserved (0)
b2e3d87f
NT
387 */
388
f9e8cacc 389 qemu_set_block(csock);
185b4338
PB
390 rc = -EINVAL;
391
b2e3d87f 392 TRACE("Beginning negotiation.");
8ffaaba0 393 memset(buf, 0, sizeof(buf));
b2e3d87f 394 memcpy(buf, "NBDMAGIC", 8);
6b8c01e7
PB
395 if (client->exp) {
396 assert ((client->exp->nbdflags & ~65535) == 0);
397 cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
398 cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
399 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
400 } else {
401 cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
402 }
b2e3d87f 403
6b8c01e7
PB
404 if (client->exp) {
405 if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
406 LOG("write failed");
407 goto fail;
408 }
409 } else {
410 if (write_sync(csock, buf, 18) != 18) {
411 LOG("write failed");
412 goto fail;
413 }
414 rc = nbd_receive_options(client);
415 if (rc < 0) {
416 LOG("option negotiation failed");
417 goto fail;
418 }
419
420 assert ((client->exp->nbdflags & ~65535) == 0);
421 cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
422 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
423 if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
424 LOG("write failed");
425 goto fail;
426 }
b2e3d87f
NT
427 }
428
07f35073 429 TRACE("Negotiation succeeded.");
185b4338
PB
430 rc = 0;
431fail:
f9e8cacc 432 qemu_set_nonblock(csock);
185b4338 433 return rc;
7a5ca864
FB
434}
435
1d45f8b5
LV
436int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
437 off_t *size, size_t *blocksize)
7a5ca864 438{
b2e3d87f
NT
439 char buf[256];
440 uint64_t magic, s;
441 uint16_t tmp;
185b4338 442 int rc;
b2e3d87f 443
07f35073 444 TRACE("Receiving negotiation.");
b2e3d87f 445
185b4338
PB
446 rc = -EINVAL;
447
b2e3d87f
NT
448 if (read_sync(csock, buf, 8) != 8) {
449 LOG("read failed");
185b4338 450 goto fail;
b2e3d87f
NT
451 }
452
453 buf[8] = '\0';
454 if (strlen(buf) == 0) {
455 LOG("server connection closed");
185b4338 456 goto fail;
b2e3d87f
NT
457 }
458
459 TRACE("Magic is %c%c%c%c%c%c%c%c",
460 qemu_isprint(buf[0]) ? buf[0] : '.',
461 qemu_isprint(buf[1]) ? buf[1] : '.',
462 qemu_isprint(buf[2]) ? buf[2] : '.',
463 qemu_isprint(buf[3]) ? buf[3] : '.',
464 qemu_isprint(buf[4]) ? buf[4] : '.',
465 qemu_isprint(buf[5]) ? buf[5] : '.',
466 qemu_isprint(buf[6]) ? buf[6] : '.',
467 qemu_isprint(buf[7]) ? buf[7] : '.');
468
469 if (memcmp(buf, "NBDMAGIC", 8) != 0) {
470 LOG("Invalid magic received");
185b4338 471 goto fail;
b2e3d87f
NT
472 }
473
474 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
475 LOG("read failed");
185b4338 476 goto fail;
b2e3d87f
NT
477 }
478 magic = be64_to_cpu(magic);
479 TRACE("Magic is 0x%" PRIx64, magic);
480
481 if (name) {
482 uint32_t reserved = 0;
483 uint32_t opt;
484 uint32_t namesize;
485
486 TRACE("Checking magic (opts_magic)");
fa26c26b 487 if (magic != NBD_OPTS_MAGIC) {
b2e3d87f 488 LOG("Bad magic received");
185b4338 489 goto fail;
b2e3d87f
NT
490 }
491 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
492 LOG("flags read failed");
185b4338 493 goto fail;
b2e3d87f
NT
494 }
495 *flags = be16_to_cpu(tmp) << 16;
496 /* reserved for future use */
497 if (write_sync(csock, &reserved, sizeof(reserved)) !=
498 sizeof(reserved)) {
499 LOG("write failed (reserved)");
185b4338 500 goto fail;
b2e3d87f
NT
501 }
502 /* write the export name */
503 magic = cpu_to_be64(magic);
504 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
505 LOG("write failed (magic)");
185b4338 506 goto fail;
b2e3d87f
NT
507 }
508 opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
509 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
510 LOG("write failed (opt)");
185b4338 511 goto fail;
b2e3d87f
NT
512 }
513 namesize = cpu_to_be32(strlen(name));
514 if (write_sync(csock, &namesize, sizeof(namesize)) !=
515 sizeof(namesize)) {
516 LOG("write failed (namesize)");
185b4338 517 goto fail;
b2e3d87f
NT
518 }
519 if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
520 LOG("write failed (name)");
185b4338 521 goto fail;
b2e3d87f
NT
522 }
523 } else {
524 TRACE("Checking magic (cli_magic)");
525
fa26c26b 526 if (magic != NBD_CLIENT_MAGIC) {
b2e3d87f 527 LOG("Bad magic received");
185b4338 528 goto fail;
b2e3d87f
NT
529 }
530 }
531
532 if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
533 LOG("read failed");
185b4338 534 goto fail;
b2e3d87f
NT
535 }
536 *size = be64_to_cpu(s);
537 *blocksize = 1024;
538 TRACE("Size is %" PRIu64, *size);
539
540 if (!name) {
541 if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
542 LOG("read failed (flags)");
185b4338 543 goto fail;
b2e3d87f
NT
544 }
545 *flags = be32_to_cpup(flags);
546 } else {
547 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
548 LOG("read failed (tmp)");
185b4338 549 goto fail;
b2e3d87f
NT
550 }
551 *flags |= be32_to_cpu(tmp);
552 }
553 if (read_sync(csock, &buf, 124) != 124) {
554 LOG("read failed (buf)");
185b4338 555 goto fail;
b2e3d87f 556 }
185b4338
PB
557 rc = 0;
558
559fail:
560 return rc;
cd831bd7 561}
7a5ca864 562
b90fb4b8
PB
563#ifdef __linux__
564int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
cd831bd7 565{
3e05c785
CL
566 TRACE("Setting NBD socket");
567
fc19f8a0 568 if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
3e05c785
CL
569 int serrno = errno;
570 LOG("Failed to set NBD socket");
185b4338 571 return -serrno;
3e05c785
CL
572 }
573
b2e3d87f 574 TRACE("Setting block size to %lu", (unsigned long)blocksize);
7a5ca864 575
fc19f8a0 576 if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
b2e3d87f
NT
577 int serrno = errno;
578 LOG("Failed setting NBD block size");
185b4338 579 return -serrno;
b2e3d87f 580 }
7a5ca864 581
0bfcd599 582 TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
7a5ca864 583
fc19f8a0 584 if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
b2e3d87f
NT
585 int serrno = errno;
586 LOG("Failed setting size (in blocks)");
185b4338 587 return -serrno;
b2e3d87f 588 }
7a5ca864 589
c8969ede
PB
590 if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
591 if (errno == ENOTTY) {
592 int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
593 TRACE("Setting readonly attribute");
594
595 if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
596 int serrno = errno;
597 LOG("Failed setting read-only attribute");
598 return -serrno;
599 }
600 } else {
b90fb4b8 601 int serrno = errno;
c8969ede 602 LOG("Failed setting flags");
185b4338 603 return -serrno;
b90fb4b8
PB
604 }
605 }
606
b2e3d87f 607 TRACE("Negotiation ended");
7a5ca864 608
b2e3d87f 609 return 0;
7a5ca864
FB
610}
611
612int nbd_disconnect(int fd)
613{
b2e3d87f
NT
614 ioctl(fd, NBD_CLEAR_QUE);
615 ioctl(fd, NBD_DISCONNECT);
616 ioctl(fd, NBD_CLEAR_SOCK);
617 return 0;
7a5ca864
FB
618}
619
0a4eb864 620int nbd_client(int fd)
7a5ca864 621{
b2e3d87f
NT
622 int ret;
623 int serrno;
7a5ca864 624
b2e3d87f 625 TRACE("Doing NBD loop");
7a5ca864 626
b2e3d87f 627 ret = ioctl(fd, NBD_DO_IT);
fc19f8a0 628 if (ret < 0 && errno == EPIPE) {
74624688
PB
629 /* NBD_DO_IT normally returns EPIPE when someone has disconnected
630 * the socket via NBD_DISCONNECT. We do not want to return 1 in
631 * that case.
632 */
633 ret = 0;
634 }
b2e3d87f 635 serrno = errno;
7a5ca864 636
b2e3d87f 637 TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
7a5ca864 638
b2e3d87f
NT
639 TRACE("Clearing NBD queue");
640 ioctl(fd, NBD_CLEAR_QUE);
7a5ca864 641
b2e3d87f
NT
642 TRACE("Clearing NBD socket");
643 ioctl(fd, NBD_CLEAR_SOCK);
7a5ca864 644
b2e3d87f
NT
645 errno = serrno;
646 return ret;
7a5ca864 647}
03ff3ca3 648#else
/* Stub used when not building for Linux: always fails with -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    (void)fd;
    (void)csock;
    (void)flags;
    (void)size;
    (void)blocksize;

    return -ENOTSUP;
}
653
/* Stub used when not building for Linux: always fails with -ENOTSUP. */
int nbd_disconnect(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
658
/* Stub used when not building for Linux: always fails with -ENOTSUP. */
int nbd_client(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
663#endif
7a5ca864 664
94e7340b 665ssize_t nbd_send_request(int csock, struct nbd_request *request)
7a5ca864 666{
fa26c26b 667 uint8_t buf[NBD_REQUEST_SIZE];
185b4338 668 ssize_t ret;
b2e3d87f
NT
669
670 cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
671 cpu_to_be32w((uint32_t*)(buf + 4), request->type);
672 cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
673 cpu_to_be64w((uint64_t*)(buf + 16), request->from);
674 cpu_to_be32w((uint32_t*)(buf + 24), request->len);
75818250 675
b2e3d87f
NT
676 TRACE("Sending request to client: "
677 "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
678 request->from, request->len, request->handle, request->type);
679
185b4338
PB
680 ret = write_sync(csock, buf, sizeof(buf));
681 if (ret < 0) {
682 return ret;
683 }
684
685 if (ret != sizeof(buf)) {
b2e3d87f 686 LOG("writing to socket failed");
185b4338 687 return -EINVAL;
b2e3d87f
NT
688 }
689 return 0;
690}
75818250 691
94e7340b 692static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
75818250 693{
fa26c26b 694 uint8_t buf[NBD_REQUEST_SIZE];
b2e3d87f 695 uint32_t magic;
185b4338 696 ssize_t ret;
b2e3d87f 697
185b4338
PB
698 ret = read_sync(csock, buf, sizeof(buf));
699 if (ret < 0) {
700 return ret;
701 }
702
703 if (ret != sizeof(buf)) {
b2e3d87f 704 LOG("read failed");
185b4338 705 return -EINVAL;
b2e3d87f
NT
706 }
707
708 /* Request
709 [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
710 [ 4 .. 7] type (0 == READ, 1 == WRITE)
711 [ 8 .. 15] handle
712 [16 .. 23] from
713 [24 .. 27] len
714 */
715
716 magic = be32_to_cpup((uint32_t*)buf);
717 request->type = be32_to_cpup((uint32_t*)(buf + 4));
718 request->handle = be64_to_cpup((uint64_t*)(buf + 8));
719 request->from = be64_to_cpup((uint64_t*)(buf + 16));
720 request->len = be32_to_cpup((uint32_t*)(buf + 24));
721
722 TRACE("Got request: "
723 "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
724 magic, request->type, request->from, request->len);
725
726 if (magic != NBD_REQUEST_MAGIC) {
727 LOG("invalid magic (got 0x%x)", magic);
185b4338 728 return -EINVAL;
b2e3d87f
NT
729 }
730 return 0;
75818250
TS
731}
732
94e7340b 733ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
75818250 734{
b2e3d87f
NT
735 uint8_t buf[NBD_REPLY_SIZE];
736 uint32_t magic;
185b4338 737 ssize_t ret;
b2e3d87f 738
185b4338
PB
739 ret = read_sync(csock, buf, sizeof(buf));
740 if (ret < 0) {
741 return ret;
742 }
743
744 if (ret != sizeof(buf)) {
b2e3d87f 745 LOG("read failed");
185b4338 746 return -EINVAL;
b2e3d87f
NT
747 }
748
749 /* Reply
750 [ 0 .. 3] magic (NBD_REPLY_MAGIC)
751 [ 4 .. 7] error (0 == no error)
752 [ 7 .. 15] handle
753 */
754
755 magic = be32_to_cpup((uint32_t*)buf);
756 reply->error = be32_to_cpup((uint32_t*)(buf + 4));
757 reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
758
759 TRACE("Got reply: "
760 "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
761 magic, reply->error, reply->handle);
762
763 if (magic != NBD_REPLY_MAGIC) {
764 LOG("invalid magic (got 0x%x)", magic);
185b4338 765 return -EINVAL;
b2e3d87f
NT
766 }
767 return 0;
75818250
TS
768}
769
94e7340b 770static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
75818250 771{
fa26c26b 772 uint8_t buf[NBD_REPLY_SIZE];
185b4338 773 ssize_t ret;
b2e3d87f
NT
774
775 /* Reply
776 [ 0 .. 3] magic (NBD_REPLY_MAGIC)
777 [ 4 .. 7] error (0 == no error)
778 [ 7 .. 15] handle
779 */
780 cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
781 cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
782 cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
783
784 TRACE("Sending response to client");
785
185b4338
PB
786 ret = write_sync(csock, buf, sizeof(buf));
787 if (ret < 0) {
788 return ret;
789 }
790
791 if (ret != sizeof(buf)) {
b2e3d87f 792 LOG("writing to socket failed");
185b4338 793 return -EINVAL;
b2e3d87f
NT
794 }
795 return 0;
75818250 796}
7a5ca864 797
41996e38
PB
798#define MAX_NBD_REQUESTS 16
799
ce33967a 800void nbd_client_get(NBDClient *client)
1743b515
PB
801{
802 client->refcount++;
803}
804
ce33967a 805void nbd_client_put(NBDClient *client)
1743b515
PB
806{
807 if (--client->refcount == 0) {
ff2b68aa
PB
808 /* The last reference should be dropped by client->close,
809 * which is called by nbd_client_close.
810 */
811 assert(client->closing);
812
813 qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
814 close(client->sock);
815 client->sock = -1;
6b8c01e7
PB
816 if (client->exp) {
817 QTAILQ_REMOVE(&client->exp->clients, client, next);
818 nbd_export_put(client->exp);
819 }
1743b515
PB
820 g_free(client);
821 }
822}
823
ff2b68aa 824void nbd_client_close(NBDClient *client)
1743b515 825{
ff2b68aa
PB
826 if (client->closing) {
827 return;
828 }
829
830 client->closing = true;
831
832 /* Force requests to finish. They will drop their own references,
833 * then we'll close the socket and free the NBDClient.
834 */
835 shutdown(client->sock, 2);
836
837 /* Also tell the client, so that they release their reference. */
1743b515
PB
838 if (client->close) {
839 client->close(client);
840 }
1743b515
PB
841}
842
72deddc5 843static NBDRequest *nbd_request_get(NBDClient *client)
d9a73806
PB
844{
845 NBDRequest *req;
72deddc5 846
41996e38
PB
847 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
848 client->nb_requests++;
849
e1adb27a 850 req = g_slice_new0(NBDRequest);
72deddc5
PB
851 nbd_client_get(client);
852 req->client = client;
d9a73806
PB
853 return req;
854}
855
72deddc5 856static void nbd_request_put(NBDRequest *req)
d9a73806 857{
72deddc5 858 NBDClient *client = req->client;
e1adb27a 859
2d821488
SH
860 if (req->data) {
861 qemu_vfree(req->data);
862 }
e1adb27a
SH
863 g_slice_free(NBDRequest, req);
864
41996e38
PB
865 if (client->nb_requests-- == MAX_NBD_REQUESTS) {
866 qemu_notify_event();
867 }
72deddc5 868 nbd_client_put(client);
d9a73806
PB
869}
870
af49bbbe 871NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
0ddf08db
PB
872 off_t size, uint32_t nbdflags,
873 void (*close)(NBDExport *))
af49bbbe
PB
874{
875 NBDExport *exp = g_malloc0(sizeof(NBDExport));
2c8d9f06 876 exp->refcount = 1;
4b9441f6 877 QTAILQ_INIT(&exp->clients);
af49bbbe
PB
878 exp->bs = bs;
879 exp->dev_offset = dev_offset;
880 exp->nbdflags = nbdflags;
38ceff04 881 exp->size = size == -1 ? bdrv_getlength(bs) : size;
0ddf08db 882 exp->close = close;
38b54b6d 883 bdrv_ref(bs);
af49bbbe
PB
884 return exp;
885}
886
ee0a19ec
PB
887NBDExport *nbd_export_find(const char *name)
888{
889 NBDExport *exp;
890 QTAILQ_FOREACH(exp, &exports, next) {
891 if (strcmp(name, exp->name) == 0) {
892 return exp;
893 }
894 }
895
896 return NULL;
897}
898
899void nbd_export_set_name(NBDExport *exp, const char *name)
900{
901 if (exp->name == name) {
902 return;
903 }
904
905 nbd_export_get(exp);
906 if (exp->name != NULL) {
907 g_free(exp->name);
908 exp->name = NULL;
909 QTAILQ_REMOVE(&exports, exp, next);
910 nbd_export_put(exp);
911 }
912 if (name != NULL) {
913 nbd_export_get(exp);
914 exp->name = g_strdup(name);
915 QTAILQ_INSERT_TAIL(&exports, exp, next);
916 }
917 nbd_export_put(exp);
918}
919
af49bbbe
PB
920void nbd_export_close(NBDExport *exp)
921{
4b9441f6 922 NBDClient *client, *next;
2c8d9f06 923
4b9441f6
PB
924 nbd_export_get(exp);
925 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
926 nbd_client_close(client);
927 }
125afda8 928 nbd_export_set_name(exp, NULL);
4b9441f6 929 nbd_export_put(exp);
38b54b6d
FZ
930 if (exp->bs) {
931 bdrv_unref(exp->bs);
932 exp->bs = NULL;
933 }
2c8d9f06
PB
934}
935
936void nbd_export_get(NBDExport *exp)
937{
938 assert(exp->refcount > 0);
939 exp->refcount++;
940}
941
942void nbd_export_put(NBDExport *exp)
943{
944 assert(exp->refcount > 0);
945 if (exp->refcount == 1) {
946 nbd_export_close(exp);
d9a73806
PB
947 }
948
2c8d9f06 949 if (--exp->refcount == 0) {
ee0a19ec
PB
950 assert(exp->name == NULL);
951
0ddf08db
PB
952 if (exp->close) {
953 exp->close(exp);
954 }
955
2c8d9f06
PB
956 g_free(exp);
957 }
af49bbbe
PB
958}
959
125afda8
PB
960BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
961{
962 return exp->bs;
963}
964
ee0a19ec
PB
965void nbd_export_close_all(void)
966{
967 NBDExport *exp, *next;
968
969 QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
970 nbd_export_close(exp);
ee0a19ec
PB
971 }
972}
973
41996e38 974static int nbd_can_read(void *opaque);
262db388
PB
975static void nbd_read(void *opaque);
976static void nbd_restart_write(void *opaque);
977
94e7340b
PB
978static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
979 int len)
22045592 980{
72deddc5
PB
981 NBDClient *client = req->client;
982 int csock = client->sock;
94e7340b 983 ssize_t rc, ret;
22045592 984
262db388 985 qemu_co_mutex_lock(&client->send_lock);
41996e38
PB
986 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
987 nbd_restart_write, client);
262db388
PB
988 client->send_coroutine = qemu_coroutine_self();
989
22045592
PB
990 if (!len) {
991 rc = nbd_send_reply(csock, reply);
22045592
PB
992 } else {
993 socket_set_cork(csock, 1);
994 rc = nbd_send_reply(csock, reply);
fc19f8a0 995 if (rc >= 0) {
262db388 996 ret = qemu_co_send(csock, req->data, len);
22045592 997 if (ret != len) {
185b4338 998 rc = -EIO;
22045592
PB
999 }
1000 }
22045592
PB
1001 socket_set_cork(csock, 0);
1002 }
262db388
PB
1003
1004 client->send_coroutine = NULL;
41996e38 1005 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
262db388 1006 qemu_co_mutex_unlock(&client->send_lock);
22045592
PB
1007 return rc;
1008}
1009
94e7340b 1010static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
a030b347 1011{
72deddc5
PB
1012 NBDClient *client = req->client;
1013 int csock = client->sock;
2d821488 1014 uint32_t command;
94e7340b 1015 ssize_t rc;
a030b347 1016
262db388 1017 client->recv_coroutine = qemu_coroutine_self();
7fe7b68b
PB
1018 rc = nbd_receive_request(csock, request);
1019 if (rc < 0) {
1020 if (rc != -EAGAIN) {
1021 rc = -EIO;
1022 }
a030b347
PB
1023 goto out;
1024 }
1025
2d821488 1026 if (request->len > NBD_MAX_BUFFER_SIZE) {
a030b347 1027 LOG("len (%u) is larger than max len (%u)",
2d821488 1028 request->len, NBD_MAX_BUFFER_SIZE);
a030b347
PB
1029 rc = -EINVAL;
1030 goto out;
1031 }
1032
1033 if ((request->from + request->len) < request->from) {
1034 LOG("integer overflow detected! "
1035 "you're probably being attacked");
1036 rc = -EINVAL;
1037 goto out;
1038 }
1039
1040 TRACE("Decoding type");
1041
2d821488
SH
1042 command = request->type & NBD_CMD_MASK_COMMAND;
1043 if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1044 req->data = qemu_blockalign(client->exp->bs, request->len);
1045 }
1046 if (command == NBD_CMD_WRITE) {
a030b347
PB
1047 TRACE("Reading %u byte(s)", request->len);
1048
262db388 1049 if (qemu_co_recv(csock, req->data, request->len) != request->len) {
a030b347
PB
1050 LOG("reading from socket failed");
1051 rc = -EIO;
1052 goto out;
1053 }
1054 }
1055 rc = 0;
1056
1057out:
262db388 1058 client->recv_coroutine = NULL;
a030b347
PB
1059 return rc;
1060}
1061
262db388 1062static void nbd_trip(void *opaque)
75818250 1063{
262db388 1064 NBDClient *client = opaque;
1743b515 1065 NBDExport *exp = client->exp;
ff2b68aa 1066 NBDRequest *req;
b2e3d87f
NT
1067 struct nbd_request request;
1068 struct nbd_reply reply;
94e7340b 1069 ssize_t ret;
b2e3d87f
NT
1070
1071 TRACE("Reading request.");
ff2b68aa
PB
1072 if (client->closing) {
1073 return;
1074 }
b2e3d87f 1075
ff2b68aa 1076 req = nbd_request_get(client);
262db388 1077 ret = nbd_co_receive_request(req, &request);
7fe7b68b
PB
1078 if (ret == -EAGAIN) {
1079 goto done;
1080 }
a030b347 1081 if (ret == -EIO) {
d9a73806 1082 goto out;
a030b347 1083 }
b2e3d87f 1084
fae69416
PB
1085 reply.handle = request.handle;
1086 reply.error = 0;
1087
a030b347
PB
1088 if (ret < 0) {
1089 reply.error = -ret;
1090 goto error_reply;
b2e3d87f
NT
1091 }
1092
af49bbbe 1093 if ((request.from + request.len) > exp->size) {
b2e3d87f
NT
1094 LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1095 ", Offset: %" PRIu64 "\n",
af49bbbe 1096 request.from, request.len,
0fee8f34 1097 (uint64_t)exp->size, (uint64_t)exp->dev_offset);
b2e3d87f 1098 LOG("requested operation past EOF--bad client?");
fae69416 1099 goto invalid_request;
b2e3d87f
NT
1100 }
1101
2c7989a9 1102 switch (request.type & NBD_CMD_MASK_COMMAND) {
b2e3d87f
NT
1103 case NBD_CMD_READ:
1104 TRACE("Request type is READ");
1105
e25ceb76
PB
1106 if (request.type & NBD_CMD_FLAG_FUA) {
1107 ret = bdrv_co_flush(exp->bs);
1108 if (ret < 0) {
1109 LOG("flush failed");
1110 reply.error = -ret;
1111 goto error_reply;
1112 }
1113 }
1114
af49bbbe 1115 ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
d9a73806 1116 req->data, request.len / 512);
adcf6302 1117 if (ret < 0) {
b2e3d87f 1118 LOG("reading from file failed");
adcf6302 1119 reply.error = -ret;
fae69416 1120 goto error_reply;
b2e3d87f 1121 }
b2e3d87f
NT
1122
1123 TRACE("Read %u byte(s)", request.len);
262db388 1124 if (nbd_co_send_reply(req, &reply, request.len) < 0)
d9a73806 1125 goto out;
b2e3d87f
NT
1126 break;
1127 case NBD_CMD_WRITE:
1128 TRACE("Request type is WRITE");
1129
af49bbbe 1130 if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
b2e3d87f 1131 TRACE("Server is read-only, return error");
fae69416
PB
1132 reply.error = EROFS;
1133 goto error_reply;
1134 }
1135
1136 TRACE("Writing to device");
1137
af49bbbe 1138 ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
d9a73806 1139 req->data, request.len / 512);
fae69416
PB
1140 if (ret < 0) {
1141 LOG("writing to file failed");
1142 reply.error = -ret;
1143 goto error_reply;
1144 }
b2e3d87f 1145
fae69416 1146 if (request.type & NBD_CMD_FLAG_FUA) {
262db388 1147 ret = bdrv_co_flush(exp->bs);
adcf6302 1148 if (ret < 0) {
fae69416 1149 LOG("flush failed");
adcf6302 1150 reply.error = -ret;
fae69416 1151 goto error_reply;
2c7989a9 1152 }
b2e3d87f
NT
1153 }
1154
fc19f8a0 1155 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1156 goto out;
fc19f8a0 1157 }
b2e3d87f
NT
1158 break;
1159 case NBD_CMD_DISC:
1160 TRACE("Request type is DISCONNECT");
1161 errno = 0;
262db388 1162 goto out;
1486d04a
PB
1163 case NBD_CMD_FLUSH:
1164 TRACE("Request type is FLUSH");
1165
262db388 1166 ret = bdrv_co_flush(exp->bs);
1486d04a
PB
1167 if (ret < 0) {
1168 LOG("flush failed");
1169 reply.error = -ret;
1170 }
fc19f8a0 1171 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1172 goto out;
fc19f8a0 1173 }
7a706633
PB
1174 break;
1175 case NBD_CMD_TRIM:
1176 TRACE("Request type is TRIM");
262db388
PB
1177 ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
1178 request.len / 512);
7a706633
PB
1179 if (ret < 0) {
1180 LOG("discard failed");
1181 reply.error = -ret;
1182 }
fc19f8a0 1183 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1184 goto out;
fc19f8a0 1185 }
1486d04a 1186 break;
b2e3d87f
NT
1187 default:
1188 LOG("invalid request type (%u) received", request.type);
fae69416
PB
1189 invalid_request:
1190 reply.error = -EINVAL;
1191 error_reply:
fc19f8a0 1192 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1193 goto out;
fc19f8a0 1194 }
fae69416 1195 break;
b2e3d87f
NT
1196 }
1197
1198 TRACE("Request/Reply complete");
1199
7fe7b68b 1200done:
262db388
PB
1201 nbd_request_put(req);
1202 return;
1203
d9a73806 1204out:
72deddc5 1205 nbd_request_put(req);
262db388 1206 nbd_client_close(client);
7a5ca864 1207}
af49bbbe 1208
41996e38
PB
1209static int nbd_can_read(void *opaque)
1210{
1211 NBDClient *client = opaque;
1212
1213 return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
1214}
1215
1743b515
PB
1216static void nbd_read(void *opaque)
1217{
1218 NBDClient *client = opaque;
1219
262db388
PB
1220 if (client->recv_coroutine) {
1221 qemu_coroutine_enter(client->recv_coroutine, NULL);
1222 } else {
1223 qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1743b515 1224 }
1743b515
PB
1225}
1226
262db388
PB
1227static void nbd_restart_write(void *opaque)
1228{
1229 NBDClient *client = opaque;
1230
1231 qemu_coroutine_enter(client->send_coroutine, NULL);
1232}
1233
1743b515
PB
1234NBDClient *nbd_client_new(NBDExport *exp, int csock,
1235 void (*close)(NBDClient *))
af49bbbe 1236{
1743b515 1237 NBDClient *client;
1743b515
PB
1238 client = g_malloc0(sizeof(NBDClient));
1239 client->refcount = 1;
1240 client->exp = exp;
1241 client->sock = csock;
9a304d29
PB
1242 if (nbd_send_negotiate(client) < 0) {
1243 g_free(client);
1244 return NULL;
1245 }
1743b515 1246 client->close = close;
262db388 1247 qemu_co_mutex_init(&client->send_lock);
41996e38 1248 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
2c8d9f06 1249
6b8c01e7
PB
1250 if (exp) {
1251 QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1252 nbd_export_get(exp);
1253 }
1743b515 1254 return client;
af49bbbe 1255}