]> git.proxmox.com Git - mirror_qemu.git/blame - nbd.c
nbd: Handle fixed new-style clients.
[mirror_qemu.git] / nbd.c
CommitLineData
75818250 1/*
7a5ca864
FB
2 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
3 *
4 * Network Block Device
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 of the License.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
8167ee88 16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
75818250 17 */
7a5ca864 18
737e150e
PB
19#include "block/nbd.h"
20#include "block/block.h"
7a5ca864 21
737e150e 22#include "block/coroutine.h"
262db388 23
7a5ca864
FB
24#include <errno.h>
25#include <string.h>
03ff3ca3 26#ifndef _WIN32
7a5ca864 27#include <sys/ioctl.h>
03ff3ca3 28#endif
5dc2eec9 29#if defined(__sun__) || defined(__HAIKU__)
7e00eb9b
AL
30#include <sys/ioccom.h>
31#endif
7a5ca864
FB
32#include <ctype.h>
33#include <inttypes.h>
75818250 34
b90fb4b8
PB
35#ifdef __linux__
36#include <linux/fs.h>
37#endif
38
1de7afc9
PB
39#include "qemu/sockets.h"
40#include "qemu/queue.h"
6a1751b7 41#include "qemu/main-loop.h"
03ff3ca3
AL
42
43//#define DEBUG_NBD
44
/*
 * LOG() always prints a message to stderr, prefixed with the source file,
 * function name and line number of the call site.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

/*
 * TRACE() forwards to LOG() when DEBUG_NBD is defined; otherwise it expands
 * to an empty statement and its arguments are never evaluated.
 */
#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#endif
58
f5076b5a
HB
59/* This is all part of the "official" NBD API.
60 *
61 * The most up-to-date documentation is available at:
62 * https://github.com/yoe/nbd/blob/master/doc/proto.txt
63 */
7a5ca864 64
/* Wire sizes: request = magic + type + handle + from + len */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
/* Wire sizes: reply = magic + error + handle */
#define NBD_REPLY_SIZE          (4 + 4 + 8)

/* Magic numbers that open requests, replies and negotiation messages */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698
#define NBD_OPTS_MAGIC          0x49484156454F5054LL    /* ASCII "IHAVEOPT" */
#define NBD_CLIENT_MAGIC        0x0000420281861253LL
#define NBD_REP_MAGIC           0x3e889045565a9LL       /* option replies */

/* ioctl request codes understood by the kernel NBD device */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option codes exchanged during new-style negotiation */
#define NBD_OPT_EXPORT_NAME     (1)
#define NBD_OPT_ABORT           (2)
1d45f8b5 87
9a304d29
PB
88/* Definitions for opaque data types */
89
90typedef struct NBDRequest NBDRequest;
91
92struct NBDRequest {
93 QSIMPLEQ_ENTRY(NBDRequest) entry;
94 NBDClient *client;
95 uint8_t *data;
96};
97
98struct NBDExport {
2c8d9f06 99 int refcount;
0ddf08db
PB
100 void (*close)(NBDExport *exp);
101
9a304d29 102 BlockDriverState *bs;
ee0a19ec 103 char *name;
9a304d29
PB
104 off_t dev_offset;
105 off_t size;
106 uint32_t nbdflags;
4b9441f6 107 QTAILQ_HEAD(, NBDClient) clients;
ee0a19ec 108 QTAILQ_ENTRY(NBDExport) next;
9a304d29
PB
109};
110
ee0a19ec
PB
111static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
112
9a304d29
PB
113struct NBDClient {
114 int refcount;
115 void (*close)(NBDClient *client);
116
117 NBDExport *exp;
118 int sock;
119
120 Coroutine *recv_coroutine;
121
122 CoMutex send_lock;
123 Coroutine *send_coroutine;
124
4b9441f6 125 QTAILQ_ENTRY(NBDClient) next;
9a304d29 126 int nb_requests;
ff2b68aa 127 bool closing;
9a304d29
PB
128};
129
7a5ca864
FB
130/* That's all folks */
131
/*
 * Transfer up to @size bytes between @fd and @buffer.
 *
 * @do_read selects the direction: true receives into @buffer, false sends
 * from it.  Inside a coroutine the work is delegated to qemu_co_recv() /
 * qemu_co_send().  Otherwise the call loops until everything is transferred,
 * EOF is hit, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (possibly short on EOF), or a
 * negative errno value.  EINTR is always retried; EAGAIN is retried only
 * once some data has already been transferred.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    char *bytes = buffer;
    size_t done = 0;
    int err;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t chunk;

        if (do_read) {
            chunk = qemu_recv(fd, bytes + done, size - done, 0);
        } else {
            chunk = send(fd, bytes + done, size - done, 0);
        }

        if (chunk == 0) {
            /* eof */
            break;
        }

        if (chunk > 0) {
            done += chunk;
            continue;
        }

        err = socket_error();
        if (err == EINTR || (done > 0 && err == EAGAIN)) {
            /* recoverable error: try again */
            continue;
        }

        /* unrecoverable error */
        return -err;
    }

    return done;
}
176
7fe7b68b
PB
/*
 * Receive @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * Sockets are kept in blocking mode in the negotiation phase.  After
 * that, a non-readable socket simply means that another thread stole
 * our request/reply.  Synchronization is done with recv_coroutine, so
 * that this is coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool receiving = true;

    return nbd_wr_sync(fd, buffer, size, receiving);
}
186
/*
 * Send @size bytes from @buffer to @fd via nbd_wr_sync(), starting over
 * whenever it reports -EAGAIN: for writes, we do expect the socket to be
 * writable.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    int ret;

    for (;;) {
        ret = nbd_wr_sync(fd, buffer, size, false);
        if (ret != -EAGAIN) {
            return ret;
        }
    }
}
196
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
   ...
   ...
                  Request (type == 2)

*/
223
f5076b5a 224static int nbd_send_rep(int csock, uint32_t type, uint32_t opt)
6b8c01e7 225{
6b8c01e7 226 uint64_t magic;
f5076b5a 227 uint32_t len;
6b8c01e7 228
f5076b5a
HB
229 magic = cpu_to_be64(NBD_REP_MAGIC);
230 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
231 LOG("write failed (rep magic)");
232 return -EINVAL;
6b8c01e7 233 }
f5076b5a
HB
234 opt = cpu_to_be32(opt);
235 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
236 LOG("write failed (rep opt)");
237 return -EINVAL;
6b8c01e7 238 }
f5076b5a
HB
239 type = cpu_to_be32(type);
240 if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
241 LOG("write failed (rep type)");
242 return -EINVAL;
6b8c01e7 243 }
f5076b5a
HB
244 len = cpu_to_be32(0);
245 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
246 LOG("write failed (rep data length)");
247 return -EINVAL;
6b8c01e7 248 }
f5076b5a
HB
249 return 0;
250}
6b8c01e7 251
f5076b5a
HB
252static int nbd_handle_export_name(NBDClient *client, uint32_t length)
253{
254 int rc = -EINVAL, csock = client->sock;
255 char name[256];
6b8c01e7 256
f5076b5a
HB
257 /* Client sends:
258 [20 .. xx] export name (length bytes)
259 */
6b8c01e7 260 TRACE("Checking length");
6b8c01e7
PB
261 if (length > 255) {
262 LOG("Bad length received");
263 goto fail;
264 }
265 if (read_sync(csock, name, length) != length) {
266 LOG("read failed");
267 goto fail;
268 }
269 name[length] = '\0';
270
271 client->exp = nbd_export_find(name);
272 if (!client->exp) {
273 LOG("export not found");
274 goto fail;
275 }
276
277 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
278 nbd_export_get(client->exp);
6b8c01e7
PB
279 rc = 0;
280fail:
281 return rc;
282}
283
f5076b5a
HB
284static int nbd_receive_options(NBDClient *client)
285{
286 while (1) {
287 int csock = client->sock;
288 uint32_t tmp, length;
289 uint64_t magic;
290
291 /* Client sends:
292 [ 0 .. 3] client flags
293 [ 4 .. 11] NBD_OPTS_MAGIC
294 [12 .. 15] NBD option
295 [16 .. 19] length
296 ... Rest of request
297 */
298
299 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
300 LOG("read failed");
301 return -EINVAL;
302 }
303 TRACE("Checking client flags");
304 tmp = be32_to_cpu(tmp);
305 if (tmp != 0 && tmp != NBD_FLAG_C_FIXED_NEWSTYLE) {
306 LOG("Bad client flags received");
307 return -EINVAL;
308 }
309
310 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
311 LOG("read failed");
312 return -EINVAL;
313 }
314 TRACE("Checking opts magic");
315 if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
316 LOG("Bad magic received");
317 return -EINVAL;
318 }
319
320 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
321 LOG("read failed");
322 return -EINVAL;
323 }
324
325 if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
326 LOG("read failed");
327 return -EINVAL;
328 }
329 length = be32_to_cpu(length);
330
331 TRACE("Checking option");
332 switch (be32_to_cpu(tmp)) {
333 case NBD_OPT_ABORT:
334 return -EINVAL;
335
336 case NBD_OPT_EXPORT_NAME:
337 return nbd_handle_export_name(client, length);
338
339 default:
340 tmp = be32_to_cpu(tmp);
341 LOG("Unsupported option 0x%x", tmp);
342 nbd_send_rep(client->sock, NBD_REP_ERR_UNSUP, tmp);
343 return -EINVAL;
344 }
345 }
346}
347
9a304d29 348static int nbd_send_negotiate(NBDClient *client)
7a5ca864 349{
9a304d29 350 int csock = client->sock;
b2e3d87f 351 char buf[8 + 8 + 8 + 128];
185b4338 352 int rc;
6b8c01e7
PB
353 const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
354 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
b2e3d87f 355
6b8c01e7
PB
356 /* Negotiation header without options:
357 [ 0 .. 7] passwd ("NBDMAGIC")
358 [ 8 .. 15] magic (NBD_CLIENT_MAGIC)
b2e3d87f 359 [16 .. 23] size
6b8c01e7 360 [24 .. 25] server flags (0)
5672ee54 361 [26 .. 27] export flags
6b8c01e7
PB
362 [28 .. 151] reserved (0)
363
364 Negotiation header with options, part 1:
365 [ 0 .. 7] passwd ("NBDMAGIC")
366 [ 8 .. 15] magic (NBD_OPTS_MAGIC)
367 [16 .. 17] server flags (0)
368
369 part 2 (after options are sent):
370 [18 .. 25] size
371 [26 .. 27] export flags
372 [28 .. 151] reserved (0)
b2e3d87f
NT
373 */
374
f9e8cacc 375 qemu_set_block(csock);
185b4338
PB
376 rc = -EINVAL;
377
b2e3d87f 378 TRACE("Beginning negotiation.");
8ffaaba0 379 memset(buf, 0, sizeof(buf));
b2e3d87f 380 memcpy(buf, "NBDMAGIC", 8);
6b8c01e7
PB
381 if (client->exp) {
382 assert ((client->exp->nbdflags & ~65535) == 0);
383 cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
384 cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
385 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
386 } else {
387 cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
f5076b5a 388 cpu_to_be16w((uint16_t *)(buf + 16), NBD_FLAG_FIXED_NEWSTYLE);
6b8c01e7 389 }
b2e3d87f 390
6b8c01e7
PB
391 if (client->exp) {
392 if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
393 LOG("write failed");
394 goto fail;
395 }
396 } else {
397 if (write_sync(csock, buf, 18) != 18) {
398 LOG("write failed");
399 goto fail;
400 }
401 rc = nbd_receive_options(client);
f5076b5a 402 if (rc != 0) {
6b8c01e7
PB
403 LOG("option negotiation failed");
404 goto fail;
405 }
406
407 assert ((client->exp->nbdflags & ~65535) == 0);
408 cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
409 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
410 if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
411 LOG("write failed");
412 goto fail;
413 }
b2e3d87f
NT
414 }
415
07f35073 416 TRACE("Negotiation succeeded.");
185b4338
PB
417 rc = 0;
418fail:
f9e8cacc 419 qemu_set_nonblock(csock);
185b4338 420 return rc;
7a5ca864
FB
421}
422
1d45f8b5
LV
423int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
424 off_t *size, size_t *blocksize)
7a5ca864 425{
b2e3d87f
NT
426 char buf[256];
427 uint64_t magic, s;
428 uint16_t tmp;
185b4338 429 int rc;
b2e3d87f 430
07f35073 431 TRACE("Receiving negotiation.");
b2e3d87f 432
185b4338
PB
433 rc = -EINVAL;
434
b2e3d87f
NT
435 if (read_sync(csock, buf, 8) != 8) {
436 LOG("read failed");
185b4338 437 goto fail;
b2e3d87f
NT
438 }
439
440 buf[8] = '\0';
441 if (strlen(buf) == 0) {
442 LOG("server connection closed");
185b4338 443 goto fail;
b2e3d87f
NT
444 }
445
446 TRACE("Magic is %c%c%c%c%c%c%c%c",
447 qemu_isprint(buf[0]) ? buf[0] : '.',
448 qemu_isprint(buf[1]) ? buf[1] : '.',
449 qemu_isprint(buf[2]) ? buf[2] : '.',
450 qemu_isprint(buf[3]) ? buf[3] : '.',
451 qemu_isprint(buf[4]) ? buf[4] : '.',
452 qemu_isprint(buf[5]) ? buf[5] : '.',
453 qemu_isprint(buf[6]) ? buf[6] : '.',
454 qemu_isprint(buf[7]) ? buf[7] : '.');
455
456 if (memcmp(buf, "NBDMAGIC", 8) != 0) {
457 LOG("Invalid magic received");
185b4338 458 goto fail;
b2e3d87f
NT
459 }
460
461 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
462 LOG("read failed");
185b4338 463 goto fail;
b2e3d87f
NT
464 }
465 magic = be64_to_cpu(magic);
466 TRACE("Magic is 0x%" PRIx64, magic);
467
468 if (name) {
469 uint32_t reserved = 0;
470 uint32_t opt;
471 uint32_t namesize;
472
473 TRACE("Checking magic (opts_magic)");
fa26c26b 474 if (magic != NBD_OPTS_MAGIC) {
b2e3d87f 475 LOG("Bad magic received");
185b4338 476 goto fail;
b2e3d87f
NT
477 }
478 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
479 LOG("flags read failed");
185b4338 480 goto fail;
b2e3d87f
NT
481 }
482 *flags = be16_to_cpu(tmp) << 16;
483 /* reserved for future use */
484 if (write_sync(csock, &reserved, sizeof(reserved)) !=
485 sizeof(reserved)) {
486 LOG("write failed (reserved)");
185b4338 487 goto fail;
b2e3d87f
NT
488 }
489 /* write the export name */
490 magic = cpu_to_be64(magic);
491 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
492 LOG("write failed (magic)");
185b4338 493 goto fail;
b2e3d87f
NT
494 }
495 opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
496 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
497 LOG("write failed (opt)");
185b4338 498 goto fail;
b2e3d87f
NT
499 }
500 namesize = cpu_to_be32(strlen(name));
501 if (write_sync(csock, &namesize, sizeof(namesize)) !=
502 sizeof(namesize)) {
503 LOG("write failed (namesize)");
185b4338 504 goto fail;
b2e3d87f
NT
505 }
506 if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
507 LOG("write failed (name)");
185b4338 508 goto fail;
b2e3d87f
NT
509 }
510 } else {
511 TRACE("Checking magic (cli_magic)");
512
fa26c26b 513 if (magic != NBD_CLIENT_MAGIC) {
b2e3d87f 514 LOG("Bad magic received");
185b4338 515 goto fail;
b2e3d87f
NT
516 }
517 }
518
519 if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
520 LOG("read failed");
185b4338 521 goto fail;
b2e3d87f
NT
522 }
523 *size = be64_to_cpu(s);
524 *blocksize = 1024;
525 TRACE("Size is %" PRIu64, *size);
526
527 if (!name) {
528 if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
529 LOG("read failed (flags)");
185b4338 530 goto fail;
b2e3d87f
NT
531 }
532 *flags = be32_to_cpup(flags);
533 } else {
534 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
535 LOG("read failed (tmp)");
185b4338 536 goto fail;
b2e3d87f
NT
537 }
538 *flags |= be32_to_cpu(tmp);
539 }
540 if (read_sync(csock, &buf, 124) != 124) {
541 LOG("read failed (buf)");
185b4338 542 goto fail;
b2e3d87f 543 }
185b4338
PB
544 rc = 0;
545
546fail:
547 return rc;
cd831bd7 548}
7a5ca864 549
b90fb4b8
PB
550#ifdef __linux__
551int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
cd831bd7 552{
3e05c785
CL
553 TRACE("Setting NBD socket");
554
fc19f8a0 555 if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
3e05c785
CL
556 int serrno = errno;
557 LOG("Failed to set NBD socket");
185b4338 558 return -serrno;
3e05c785
CL
559 }
560
b2e3d87f 561 TRACE("Setting block size to %lu", (unsigned long)blocksize);
7a5ca864 562
fc19f8a0 563 if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
b2e3d87f
NT
564 int serrno = errno;
565 LOG("Failed setting NBD block size");
185b4338 566 return -serrno;
b2e3d87f 567 }
7a5ca864 568
0bfcd599 569 TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
7a5ca864 570
fc19f8a0 571 if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
b2e3d87f
NT
572 int serrno = errno;
573 LOG("Failed setting size (in blocks)");
185b4338 574 return -serrno;
b2e3d87f 575 }
7a5ca864 576
c8969ede
PB
577 if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
578 if (errno == ENOTTY) {
579 int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
580 TRACE("Setting readonly attribute");
581
582 if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
583 int serrno = errno;
584 LOG("Failed setting read-only attribute");
585 return -serrno;
586 }
587 } else {
b90fb4b8 588 int serrno = errno;
c8969ede 589 LOG("Failed setting flags");
185b4338 590 return -serrno;
b90fb4b8
PB
591 }
592 }
593
b2e3d87f 594 TRACE("Negotiation ended");
7a5ca864 595
b2e3d87f 596 return 0;
7a5ca864
FB
597}
598
599int nbd_disconnect(int fd)
600{
b2e3d87f
NT
601 ioctl(fd, NBD_CLEAR_QUE);
602 ioctl(fd, NBD_DISCONNECT);
603 ioctl(fd, NBD_CLEAR_SOCK);
604 return 0;
7a5ca864
FB
605}
606
0a4eb864 607int nbd_client(int fd)
7a5ca864 608{
b2e3d87f
NT
609 int ret;
610 int serrno;
7a5ca864 611
b2e3d87f 612 TRACE("Doing NBD loop");
7a5ca864 613
b2e3d87f 614 ret = ioctl(fd, NBD_DO_IT);
fc19f8a0 615 if (ret < 0 && errno == EPIPE) {
74624688
PB
616 /* NBD_DO_IT normally returns EPIPE when someone has disconnected
617 * the socket via NBD_DISCONNECT. We do not want to return 1 in
618 * that case.
619 */
620 ret = 0;
621 }
b2e3d87f 622 serrno = errno;
7a5ca864 623
b2e3d87f 624 TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
7a5ca864 625
b2e3d87f
NT
626 TRACE("Clearing NBD queue");
627 ioctl(fd, NBD_CLEAR_QUE);
7a5ca864 628
b2e3d87f
NT
629 TRACE("Clearing NBD socket");
630 ioctl(fd, NBD_CLEAR_SOCK);
7a5ca864 631
b2e3d87f
NT
632 errno = serrno;
633 return ret;
7a5ca864 634}
03ff3ca3 635#else
/* Fallback used when not building for Linux: always fails with -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
640
/* Fallback used when not building for Linux: always fails with -ENOTSUP. */
int nbd_disconnect(int fd)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
645
/* Fallback used when not building for Linux: always fails with -ENOTSUP. */
int nbd_client(int fd)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
650#endif
7a5ca864 651
94e7340b 652ssize_t nbd_send_request(int csock, struct nbd_request *request)
7a5ca864 653{
fa26c26b 654 uint8_t buf[NBD_REQUEST_SIZE];
185b4338 655 ssize_t ret;
b2e3d87f
NT
656
657 cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
658 cpu_to_be32w((uint32_t*)(buf + 4), request->type);
659 cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
660 cpu_to_be64w((uint64_t*)(buf + 16), request->from);
661 cpu_to_be32w((uint32_t*)(buf + 24), request->len);
75818250 662
b2e3d87f
NT
663 TRACE("Sending request to client: "
664 "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
665 request->from, request->len, request->handle, request->type);
666
185b4338
PB
667 ret = write_sync(csock, buf, sizeof(buf));
668 if (ret < 0) {
669 return ret;
670 }
671
672 if (ret != sizeof(buf)) {
b2e3d87f 673 LOG("writing to socket failed");
185b4338 674 return -EINVAL;
b2e3d87f
NT
675 }
676 return 0;
677}
75818250 678
94e7340b 679static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
75818250 680{
fa26c26b 681 uint8_t buf[NBD_REQUEST_SIZE];
b2e3d87f 682 uint32_t magic;
185b4338 683 ssize_t ret;
b2e3d87f 684
185b4338
PB
685 ret = read_sync(csock, buf, sizeof(buf));
686 if (ret < 0) {
687 return ret;
688 }
689
690 if (ret != sizeof(buf)) {
b2e3d87f 691 LOG("read failed");
185b4338 692 return -EINVAL;
b2e3d87f
NT
693 }
694
695 /* Request
696 [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
697 [ 4 .. 7] type (0 == READ, 1 == WRITE)
698 [ 8 .. 15] handle
699 [16 .. 23] from
700 [24 .. 27] len
701 */
702
703 magic = be32_to_cpup((uint32_t*)buf);
704 request->type = be32_to_cpup((uint32_t*)(buf + 4));
705 request->handle = be64_to_cpup((uint64_t*)(buf + 8));
706 request->from = be64_to_cpup((uint64_t*)(buf + 16));
707 request->len = be32_to_cpup((uint32_t*)(buf + 24));
708
709 TRACE("Got request: "
710 "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
711 magic, request->type, request->from, request->len);
712
713 if (magic != NBD_REQUEST_MAGIC) {
714 LOG("invalid magic (got 0x%x)", magic);
185b4338 715 return -EINVAL;
b2e3d87f
NT
716 }
717 return 0;
75818250
TS
718}
719
94e7340b 720ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
75818250 721{
b2e3d87f
NT
722 uint8_t buf[NBD_REPLY_SIZE];
723 uint32_t magic;
185b4338 724 ssize_t ret;
b2e3d87f 725
185b4338
PB
726 ret = read_sync(csock, buf, sizeof(buf));
727 if (ret < 0) {
728 return ret;
729 }
730
731 if (ret != sizeof(buf)) {
b2e3d87f 732 LOG("read failed");
185b4338 733 return -EINVAL;
b2e3d87f
NT
734 }
735
736 /* Reply
737 [ 0 .. 3] magic (NBD_REPLY_MAGIC)
738 [ 4 .. 7] error (0 == no error)
739 [ 7 .. 15] handle
740 */
741
742 magic = be32_to_cpup((uint32_t*)buf);
743 reply->error = be32_to_cpup((uint32_t*)(buf + 4));
744 reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
745
746 TRACE("Got reply: "
747 "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
748 magic, reply->error, reply->handle);
749
750 if (magic != NBD_REPLY_MAGIC) {
751 LOG("invalid magic (got 0x%x)", magic);
185b4338 752 return -EINVAL;
b2e3d87f
NT
753 }
754 return 0;
75818250
TS
755}
756
94e7340b 757static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
75818250 758{
fa26c26b 759 uint8_t buf[NBD_REPLY_SIZE];
185b4338 760 ssize_t ret;
b2e3d87f
NT
761
762 /* Reply
763 [ 0 .. 3] magic (NBD_REPLY_MAGIC)
764 [ 4 .. 7] error (0 == no error)
765 [ 7 .. 15] handle
766 */
767 cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
768 cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
769 cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
770
771 TRACE("Sending response to client");
772
185b4338
PB
773 ret = write_sync(csock, buf, sizeof(buf));
774 if (ret < 0) {
775 return ret;
776 }
777
778 if (ret != sizeof(buf)) {
b2e3d87f 779 LOG("writing to socket failed");
185b4338 780 return -EINVAL;
b2e3d87f
NT
781 }
782 return 0;
75818250 783}
7a5ca864 784
41996e38
PB
/* Upper bound on the number of live NBDRequest objects per client. */
#define MAX_NBD_REQUESTS 16
786
ce33967a 787void nbd_client_get(NBDClient *client)
1743b515
PB
788{
789 client->refcount++;
790}
791
ce33967a 792void nbd_client_put(NBDClient *client)
1743b515
PB
793{
794 if (--client->refcount == 0) {
ff2b68aa
PB
795 /* The last reference should be dropped by client->close,
796 * which is called by nbd_client_close.
797 */
798 assert(client->closing);
799
800 qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
801 close(client->sock);
802 client->sock = -1;
6b8c01e7
PB
803 if (client->exp) {
804 QTAILQ_REMOVE(&client->exp->clients, client, next);
805 nbd_export_put(client->exp);
806 }
1743b515
PB
807 g_free(client);
808 }
809}
810
ff2b68aa 811void nbd_client_close(NBDClient *client)
1743b515 812{
ff2b68aa
PB
813 if (client->closing) {
814 return;
815 }
816
817 client->closing = true;
818
819 /* Force requests to finish. They will drop their own references,
820 * then we'll close the socket and free the NBDClient.
821 */
822 shutdown(client->sock, 2);
823
824 /* Also tell the client, so that they release their reference. */
1743b515
PB
825 if (client->close) {
826 client->close(client);
827 }
1743b515
PB
828}
829
72deddc5 830static NBDRequest *nbd_request_get(NBDClient *client)
d9a73806
PB
831{
832 NBDRequest *req;
72deddc5 833
41996e38
PB
834 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
835 client->nb_requests++;
836
e1adb27a 837 req = g_slice_new0(NBDRequest);
72deddc5
PB
838 nbd_client_get(client);
839 req->client = client;
d9a73806
PB
840 return req;
841}
842
72deddc5 843static void nbd_request_put(NBDRequest *req)
d9a73806 844{
72deddc5 845 NBDClient *client = req->client;
e1adb27a 846
2d821488
SH
847 if (req->data) {
848 qemu_vfree(req->data);
849 }
e1adb27a
SH
850 g_slice_free(NBDRequest, req);
851
41996e38
PB
852 if (client->nb_requests-- == MAX_NBD_REQUESTS) {
853 qemu_notify_event();
854 }
72deddc5 855 nbd_client_put(client);
d9a73806
PB
856}
857
af49bbbe 858NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
0ddf08db
PB
859 off_t size, uint32_t nbdflags,
860 void (*close)(NBDExport *))
af49bbbe
PB
861{
862 NBDExport *exp = g_malloc0(sizeof(NBDExport));
2c8d9f06 863 exp->refcount = 1;
4b9441f6 864 QTAILQ_INIT(&exp->clients);
af49bbbe
PB
865 exp->bs = bs;
866 exp->dev_offset = dev_offset;
867 exp->nbdflags = nbdflags;
38ceff04 868 exp->size = size == -1 ? bdrv_getlength(bs) : size;
0ddf08db 869 exp->close = close;
38b54b6d 870 bdrv_ref(bs);
af49bbbe
PB
871 return exp;
872}
873
ee0a19ec
PB
874NBDExport *nbd_export_find(const char *name)
875{
876 NBDExport *exp;
877 QTAILQ_FOREACH(exp, &exports, next) {
878 if (strcmp(name, exp->name) == 0) {
879 return exp;
880 }
881 }
882
883 return NULL;
884}
885
886void nbd_export_set_name(NBDExport *exp, const char *name)
887{
888 if (exp->name == name) {
889 return;
890 }
891
892 nbd_export_get(exp);
893 if (exp->name != NULL) {
894 g_free(exp->name);
895 exp->name = NULL;
896 QTAILQ_REMOVE(&exports, exp, next);
897 nbd_export_put(exp);
898 }
899 if (name != NULL) {
900 nbd_export_get(exp);
901 exp->name = g_strdup(name);
902 QTAILQ_INSERT_TAIL(&exports, exp, next);
903 }
904 nbd_export_put(exp);
905}
906
af49bbbe
PB
907void nbd_export_close(NBDExport *exp)
908{
4b9441f6 909 NBDClient *client, *next;
2c8d9f06 910
4b9441f6
PB
911 nbd_export_get(exp);
912 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
913 nbd_client_close(client);
914 }
125afda8 915 nbd_export_set_name(exp, NULL);
4b9441f6 916 nbd_export_put(exp);
38b54b6d
FZ
917 if (exp->bs) {
918 bdrv_unref(exp->bs);
919 exp->bs = NULL;
920 }
2c8d9f06
PB
921}
922
923void nbd_export_get(NBDExport *exp)
924{
925 assert(exp->refcount > 0);
926 exp->refcount++;
927}
928
929void nbd_export_put(NBDExport *exp)
930{
931 assert(exp->refcount > 0);
932 if (exp->refcount == 1) {
933 nbd_export_close(exp);
d9a73806
PB
934 }
935
2c8d9f06 936 if (--exp->refcount == 0) {
ee0a19ec
PB
937 assert(exp->name == NULL);
938
0ddf08db
PB
939 if (exp->close) {
940 exp->close(exp);
941 }
942
2c8d9f06
PB
943 g_free(exp);
944 }
af49bbbe
PB
945}
946
125afda8
PB
947BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
948{
949 return exp->bs;
950}
951
ee0a19ec
PB
952void nbd_export_close_all(void)
953{
954 NBDExport *exp, *next;
955
956 QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
957 nbd_export_close(exp);
ee0a19ec
PB
958 }
959}
960
/* fd-handler callbacks, defined further down; nbd_co_send_reply needs them */
static int nbd_can_read(void *opaque);
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);
964
94e7340b
PB
965static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
966 int len)
22045592 967{
72deddc5
PB
968 NBDClient *client = req->client;
969 int csock = client->sock;
94e7340b 970 ssize_t rc, ret;
22045592 971
262db388 972 qemu_co_mutex_lock(&client->send_lock);
41996e38
PB
973 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
974 nbd_restart_write, client);
262db388
PB
975 client->send_coroutine = qemu_coroutine_self();
976
22045592
PB
977 if (!len) {
978 rc = nbd_send_reply(csock, reply);
22045592
PB
979 } else {
980 socket_set_cork(csock, 1);
981 rc = nbd_send_reply(csock, reply);
fc19f8a0 982 if (rc >= 0) {
262db388 983 ret = qemu_co_send(csock, req->data, len);
22045592 984 if (ret != len) {
185b4338 985 rc = -EIO;
22045592
PB
986 }
987 }
22045592
PB
988 socket_set_cork(csock, 0);
989 }
262db388
PB
990
991 client->send_coroutine = NULL;
41996e38 992 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
262db388 993 qemu_co_mutex_unlock(&client->send_lock);
22045592
PB
994 return rc;
995}
996
94e7340b 997static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
a030b347 998{
72deddc5
PB
999 NBDClient *client = req->client;
1000 int csock = client->sock;
2d821488 1001 uint32_t command;
94e7340b 1002 ssize_t rc;
a030b347 1003
262db388 1004 client->recv_coroutine = qemu_coroutine_self();
7fe7b68b
PB
1005 rc = nbd_receive_request(csock, request);
1006 if (rc < 0) {
1007 if (rc != -EAGAIN) {
1008 rc = -EIO;
1009 }
a030b347
PB
1010 goto out;
1011 }
1012
2d821488 1013 if (request->len > NBD_MAX_BUFFER_SIZE) {
a030b347 1014 LOG("len (%u) is larger than max len (%u)",
2d821488 1015 request->len, NBD_MAX_BUFFER_SIZE);
a030b347
PB
1016 rc = -EINVAL;
1017 goto out;
1018 }
1019
1020 if ((request->from + request->len) < request->from) {
1021 LOG("integer overflow detected! "
1022 "you're probably being attacked");
1023 rc = -EINVAL;
1024 goto out;
1025 }
1026
1027 TRACE("Decoding type");
1028
2d821488
SH
1029 command = request->type & NBD_CMD_MASK_COMMAND;
1030 if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1031 req->data = qemu_blockalign(client->exp->bs, request->len);
1032 }
1033 if (command == NBD_CMD_WRITE) {
a030b347
PB
1034 TRACE("Reading %u byte(s)", request->len);
1035
262db388 1036 if (qemu_co_recv(csock, req->data, request->len) != request->len) {
a030b347
PB
1037 LOG("reading from socket failed");
1038 rc = -EIO;
1039 goto out;
1040 }
1041 }
1042 rc = 0;
1043
1044out:
262db388 1045 client->recv_coroutine = NULL;
a030b347
PB
1046 return rc;
1047}
1048
262db388 1049static void nbd_trip(void *opaque)
75818250 1050{
262db388 1051 NBDClient *client = opaque;
1743b515 1052 NBDExport *exp = client->exp;
ff2b68aa 1053 NBDRequest *req;
b2e3d87f
NT
1054 struct nbd_request request;
1055 struct nbd_reply reply;
94e7340b 1056 ssize_t ret;
8c5d1abb 1057 uint32_t command;
b2e3d87f
NT
1058
1059 TRACE("Reading request.");
ff2b68aa
PB
1060 if (client->closing) {
1061 return;
1062 }
b2e3d87f 1063
ff2b68aa 1064 req = nbd_request_get(client);
262db388 1065 ret = nbd_co_receive_request(req, &request);
7fe7b68b
PB
1066 if (ret == -EAGAIN) {
1067 goto done;
1068 }
a030b347 1069 if (ret == -EIO) {
d9a73806 1070 goto out;
a030b347 1071 }
b2e3d87f 1072
fae69416
PB
1073 reply.handle = request.handle;
1074 reply.error = 0;
1075
a030b347
PB
1076 if (ret < 0) {
1077 reply.error = -ret;
1078 goto error_reply;
b2e3d87f 1079 }
8c5d1abb
HB
1080 command = request.type & NBD_CMD_MASK_COMMAND;
1081 if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
b2e3d87f
NT
1082 LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1083 ", Offset: %" PRIu64 "\n",
af49bbbe 1084 request.from, request.len,
0fee8f34 1085 (uint64_t)exp->size, (uint64_t)exp->dev_offset);
b2e3d87f 1086 LOG("requested operation past EOF--bad client?");
fae69416 1087 goto invalid_request;
b2e3d87f
NT
1088 }
1089
8c5d1abb 1090 switch (command) {
b2e3d87f
NT
1091 case NBD_CMD_READ:
1092 TRACE("Request type is READ");
1093
e25ceb76
PB
1094 if (request.type & NBD_CMD_FLAG_FUA) {
1095 ret = bdrv_co_flush(exp->bs);
1096 if (ret < 0) {
1097 LOG("flush failed");
1098 reply.error = -ret;
1099 goto error_reply;
1100 }
1101 }
1102
af49bbbe 1103 ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
d9a73806 1104 req->data, request.len / 512);
adcf6302 1105 if (ret < 0) {
b2e3d87f 1106 LOG("reading from file failed");
adcf6302 1107 reply.error = -ret;
fae69416 1108 goto error_reply;
b2e3d87f 1109 }
b2e3d87f
NT
1110
1111 TRACE("Read %u byte(s)", request.len);
262db388 1112 if (nbd_co_send_reply(req, &reply, request.len) < 0)
d9a73806 1113 goto out;
b2e3d87f
NT
1114 break;
1115 case NBD_CMD_WRITE:
1116 TRACE("Request type is WRITE");
1117
af49bbbe 1118 if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
b2e3d87f 1119 TRACE("Server is read-only, return error");
fae69416
PB
1120 reply.error = EROFS;
1121 goto error_reply;
1122 }
1123
1124 TRACE("Writing to device");
1125
af49bbbe 1126 ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
d9a73806 1127 req->data, request.len / 512);
fae69416
PB
1128 if (ret < 0) {
1129 LOG("writing to file failed");
1130 reply.error = -ret;
1131 goto error_reply;
1132 }
b2e3d87f 1133
fae69416 1134 if (request.type & NBD_CMD_FLAG_FUA) {
262db388 1135 ret = bdrv_co_flush(exp->bs);
adcf6302 1136 if (ret < 0) {
fae69416 1137 LOG("flush failed");
adcf6302 1138 reply.error = -ret;
fae69416 1139 goto error_reply;
2c7989a9 1140 }
b2e3d87f
NT
1141 }
1142
fc19f8a0 1143 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1144 goto out;
fc19f8a0 1145 }
b2e3d87f
NT
1146 break;
1147 case NBD_CMD_DISC:
1148 TRACE("Request type is DISCONNECT");
1149 errno = 0;
262db388 1150 goto out;
1486d04a
PB
1151 case NBD_CMD_FLUSH:
1152 TRACE("Request type is FLUSH");
1153
262db388 1154 ret = bdrv_co_flush(exp->bs);
1486d04a
PB
1155 if (ret < 0) {
1156 LOG("flush failed");
1157 reply.error = -ret;
1158 }
fc19f8a0 1159 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1160 goto out;
fc19f8a0 1161 }
7a706633
PB
1162 break;
1163 case NBD_CMD_TRIM:
1164 TRACE("Request type is TRIM");
262db388
PB
1165 ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
1166 request.len / 512);
7a706633
PB
1167 if (ret < 0) {
1168 LOG("discard failed");
1169 reply.error = -ret;
1170 }
fc19f8a0 1171 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1172 goto out;
fc19f8a0 1173 }
1486d04a 1174 break;
b2e3d87f
NT
1175 default:
1176 LOG("invalid request type (%u) received", request.type);
fae69416
PB
1177 invalid_request:
1178 reply.error = -EINVAL;
1179 error_reply:
fc19f8a0 1180 if (nbd_co_send_reply(req, &reply, 0) < 0) {
d9a73806 1181 goto out;
fc19f8a0 1182 }
fae69416 1183 break;
b2e3d87f
NT
1184 }
1185
1186 TRACE("Request/Reply complete");
1187
7fe7b68b 1188done:
262db388
PB
1189 nbd_request_put(req);
1190 return;
1191
d9a73806 1192out:
72deddc5 1193 nbd_request_put(req);
262db388 1194 nbd_client_close(client);
7a5ca864 1195}
af49bbbe 1196
41996e38
PB
1197static int nbd_can_read(void *opaque)
1198{
1199 NBDClient *client = opaque;
1200
1201 return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
1202}
1203
1743b515
PB
1204static void nbd_read(void *opaque)
1205{
1206 NBDClient *client = opaque;
1207
262db388
PB
1208 if (client->recv_coroutine) {
1209 qemu_coroutine_enter(client->recv_coroutine, NULL);
1210 } else {
1211 qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1743b515 1212 }
1743b515
PB
1213}
1214
262db388
PB
1215static void nbd_restart_write(void *opaque)
1216{
1217 NBDClient *client = opaque;
1218
1219 qemu_coroutine_enter(client->send_coroutine, NULL);
1220}
1221
1743b515
PB
1222NBDClient *nbd_client_new(NBDExport *exp, int csock,
1223 void (*close)(NBDClient *))
af49bbbe 1224{
1743b515 1225 NBDClient *client;
1743b515
PB
1226 client = g_malloc0(sizeof(NBDClient));
1227 client->refcount = 1;
1228 client->exp = exp;
1229 client->sock = csock;
f5076b5a 1230 if (nbd_send_negotiate(client)) {
9a304d29
PB
1231 g_free(client);
1232 return NULL;
1233 }
1743b515 1234 client->close = close;
262db388 1235 qemu_co_mutex_init(&client->send_lock);
41996e38 1236 qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
2c8d9f06 1237
6b8c01e7
PB
1238 if (exp) {
1239 QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1240 nbd_export_get(exp);
1241 }
1743b515 1242 return client;
af49bbbe 1243}