]> git.proxmox.com Git - qemu.git/blob - block/nbd.c
48bbecacaf942dc74b4cbb64bde05fffd56ecaaa
[qemu.git] / block / nbd.c
1 /*
2 * QEMU Block driver for NBD
3 *
4 * Copyright (C) 2008 Bull S.A.S.
5 * Author: Laurent Vivier <Laurent.Vivier@bull.net>
6 *
7 * Some parts:
8 * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu-common.h"
30 #include "nbd.h"
31 #include "block_int.h"
32 #include "module.h"
33 #include "qemu_socket.h"
34
35 #include <sys/types.h>
36 #include <unistd.h>
37
38 #define EN_OPTSTR ":exportname="
39
40 /* #define DEBUG_NBD */
41
42 #if defined(DEBUG_NBD)
43 #define logout(fmt, ...) \
44 fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
45 #else
46 #define logout(fmt, ...) ((void)0)
47 #endif
48
49 #define MAX_NBD_REQUESTS 16
50 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
51 #define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs))
52
/* Per-device state for one NBD client connection. */
typedef struct BDRVNBDState {
    int sock;                  /* connected socket fd (non-blocking once set up) */
    uint32_t nbdflags;         /* NBD_FLAG_* capabilities advertised by server */
    off_t size;                /* export size in bytes, from negotiation */
    size_t blocksize;          /* block size reported by the server */

    CoMutex send_mutex;        /* serializes request transmission on the socket */
    CoMutex free_sema;         /* poor-man's semaphore: held while all request
                                * slots are busy (see nbd_coroutine_start) */
    Coroutine *send_coroutine; /* coroutine currently inside nbd_co_send_request */
    int in_flight;             /* number of outstanding requests */

    /* One slot per outstanding request; the reply read handler re-enters the
     * coroutine whose slot index matches the incoming reply handle. */
    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
    struct nbd_reply reply;    /* reply header being processed; handle == 0
                                * means no reply is currently in flight */

    int is_unix;               /* nonzero: UNIX socket; zero: TCP */
    char *host_spec;           /* "host:port" or UNIX socket path */
    char *export_name; /* An NBD server may export several devices */
} BDRVNBDState;
71
72 static int nbd_config(BDRVNBDState *s, const char *filename)
73 {
74 char *file;
75 char *export_name;
76 const char *host_spec;
77 const char *unixpath;
78 int err = -EINVAL;
79
80 file = g_strdup(filename);
81
82 export_name = strstr(file, EN_OPTSTR);
83 if (export_name) {
84 if (export_name[strlen(EN_OPTSTR)] == 0) {
85 goto out;
86 }
87 export_name[0] = 0; /* truncate 'file' */
88 export_name += strlen(EN_OPTSTR);
89 s->export_name = g_strdup(export_name);
90 }
91
92 /* extract the host_spec - fail if it's not nbd:... */
93 if (!strstart(file, "nbd:", &host_spec)) {
94 goto out;
95 }
96
97 /* are we a UNIX or TCP socket? */
98 if (strstart(host_spec, "unix:", &unixpath)) {
99 s->is_unix = true;
100 s->host_spec = g_strdup(unixpath);
101 } else {
102 s->is_unix = false;
103 s->host_spec = g_strdup(host_spec);
104 }
105
106 err = 0;
107
108 out:
109 g_free(file);
110 if (err != 0) {
111 g_free(s->export_name);
112 g_free(s->host_spec);
113 }
114 return err;
115 }
116
/* Reserve a request slot for the calling coroutine and encode the slot
 * index into request->handle.  Yields on free_sema when all
 * MAX_NBD_REQUESTS slots are in use. */
static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
    int i;

    /* Poor man semaphore. The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply. */
    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
        qemu_co_mutex_lock(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;

    /* Claim the first free receive slot for this coroutine. */
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i] == NULL) {
            s->recv_coroutine[i] = qemu_coroutine_self();
            break;
        }
    }

    /* A free slot must exist: in_flight was bounded above. */
    assert(i < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, i);
}
139
140 static int nbd_have_request(void *opaque)
141 {
142 BDRVNBDState *s = opaque;
143
144 return s->in_flight > 0;
145 }
146
/* Read handler for the NBD socket: fetch a reply header if none is pending,
 * then re-enter the coroutine that owns the matching request.  On a broken
 * or unmatchable reply, wake every waiting coroutine so each can observe
 * the error (handle mismatch) and fail its request. */
static void nbd_reply_ready(void *opaque)
{
    BDRVNBDState *s = opaque;
    uint64_t i;
    int ret;

    if (s->reply.handle == 0) {
        /* No reply already in flight. Fetch a header. It is possible
         * that another thread has done the same thing in parallel, so
         * the socket is not readable anymore.
         */
        ret = nbd_receive_reply(s->sock, &s->reply);
        if (ret == -EAGAIN) {
            return;
        }
        if (ret < 0) {
            s->reply.handle = 0;
            goto fail;
        }
    }

    /* There's no need for a mutex on the receive side, because the
     * handler acts as a synchronization point and ensures that only
     * one coroutine is called until the reply finishes. */
    i = HANDLE_TO_INDEX(s, s->reply.handle);
    if (i >= MAX_NBD_REQUESTS) {
        /* Handle does not map to a valid slot: treat as a fatal error. */
        goto fail;
    }

    if (s->recv_coroutine[i]) {
        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        return;
    }

fail:
    /* Error path: resume all pending coroutines; each will see that
     * s->reply.handle does not match its request and report EIO. */
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
    }
}
188
189 static void nbd_restart_write(void *opaque)
190 {
191 BDRVNBDState *s = opaque;
192 qemu_coroutine_enter(s->send_coroutine, NULL);
193 }
194
195 static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
196 QEMUIOVector *qiov, int offset)
197 {
198 int rc, ret;
199
200 qemu_co_mutex_lock(&s->send_mutex);
201 s->send_coroutine = qemu_coroutine_self();
202 qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
203 nbd_have_request, s);
204 rc = nbd_send_request(s->sock, request);
205 if (rc >= 0 && qiov) {
206 ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
207 offset, request->len);
208 if (ret != request->len) {
209 return -EIO;
210 }
211 }
212 qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
213 nbd_have_request, s);
214 s->send_coroutine = NULL;
215 qemu_co_mutex_unlock(&s->send_mutex);
216 return rc;
217 }
218
/* Called from a request coroutine after its request has been sent.
 * Yields until nbd_reply_ready() re-enters us, then copies the reply
 * header into *reply and, for reads, pulls the payload into @qiov at
 * byte @offset.  Any mismatch or short read is reported as EIO through
 * reply->error. */
static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
                                 struct nbd_reply *reply,
                                 QEMUIOVector *qiov, int offset)
{
    int ret;

    /* Wait until we're woken up by the read handler. TODO: perhaps
     * peek at the next reply and avoid yielding if it's ours? */
    qemu_coroutine_yield();
    *reply = s->reply;
    if (reply->handle != request->handle) {
        /* Woken on the error path of nbd_reply_ready(), or the reply
         * belongs to someone else: fail this request. */
        reply->error = EIO;
    } else {
        if (qiov && reply->error == 0) {
            /* The read payload follows the reply header on the socket. */
            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
                                offset, request->len);
            if (ret != request->len) {
                reply->error = EIO;
            }
        }

        /* Tell the read handler to read another header. */
        s->reply.handle = 0;
    }
}
244
/* Release the request slot claimed in nbd_coroutine_start() and, if we
 * held the last available slot, unlock free_sema to admit one waiter. */
static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
{
    int i = HANDLE_TO_INDEX(s, request->handle);
    s->recv_coroutine[i] = NULL;
    /* Post-decrement: unlock exactly when in_flight was at the limit,
     * matching the lock taken in nbd_coroutine_start(). */
    if (s->in_flight-- == MAX_NBD_REQUESTS) {
        qemu_co_mutex_unlock(&s->free_sema);
    }
}
253
254 static int nbd_establish_connection(BlockDriverState *bs)
255 {
256 BDRVNBDState *s = bs->opaque;
257 int sock;
258 int ret;
259 off_t size;
260 size_t blocksize;
261
262 if (s->is_unix) {
263 sock = unix_socket_outgoing(s->host_spec);
264 } else {
265 sock = tcp_socket_outgoing_spec(s->host_spec);
266 }
267
268 /* Failed to establish connection */
269 if (sock < 0) {
270 logout("Failed to establish connection to NBD server\n");
271 return -errno;
272 }
273
274 /* NBD handshake */
275 ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
276 &blocksize);
277 if (ret < 0) {
278 logout("Failed to negotiate with the NBD server\n");
279 closesocket(sock);
280 return ret;
281 }
282
283 /* Now that we're connected, set the socket to be non-blocking and
284 * kick the reply mechanism. */
285 socket_set_nonblock(sock);
286 qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
287 nbd_have_request, s);
288
289 s->sock = sock;
290 s->size = size;
291 s->blocksize = blocksize;
292
293 logout("Established connection with NBD server\n");
294 return 0;
295 }
296
297 static void nbd_teardown_connection(BlockDriverState *bs)
298 {
299 BDRVNBDState *s = bs->opaque;
300 struct nbd_request request;
301
302 request.type = NBD_CMD_DISC;
303 request.from = 0;
304 request.len = 0;
305 nbd_send_request(s->sock, &request);
306
307 qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL);
308 closesocket(s->sock);
309 }
310
311 static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
312 {
313 BDRVNBDState *s = bs->opaque;
314 int result;
315
316 qemu_co_mutex_init(&s->send_mutex);
317 qemu_co_mutex_init(&s->free_sema);
318
319 /* Pop the config into our state object. Exit if invalid. */
320 result = nbd_config(s, filename);
321 if (result != 0) {
322 return result;
323 }
324
325 /* establish TCP connection, return error if it fails
326 * TODO: Configurable retry-until-timeout behaviour.
327 */
328 result = nbd_establish_connection(bs);
329
330 return result;
331 }
332
333 static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
334 int nb_sectors, QEMUIOVector *qiov,
335 int offset)
336 {
337 BDRVNBDState *s = bs->opaque;
338 struct nbd_request request;
339 struct nbd_reply reply;
340 ssize_t ret;
341
342 request.type = NBD_CMD_READ;
343 request.from = sector_num * 512;
344 request.len = nb_sectors * 512;
345
346 nbd_coroutine_start(s, &request);
347 ret = nbd_co_send_request(s, &request, NULL, 0);
348 if (ret < 0) {
349 reply.error = -ret;
350 } else {
351 nbd_co_receive_reply(s, &request, &reply, qiov, offset);
352 }
353 nbd_coroutine_end(s, &request);
354 return -reply.error;
355
356 }
357
358 static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
359 int nb_sectors, QEMUIOVector *qiov,
360 int offset)
361 {
362 BDRVNBDState *s = bs->opaque;
363 struct nbd_request request;
364 struct nbd_reply reply;
365 ssize_t ret;
366
367 request.type = NBD_CMD_WRITE;
368 if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
369 request.type |= NBD_CMD_FLAG_FUA;
370 }
371
372 request.from = sector_num * 512;
373 request.len = nb_sectors * 512;
374
375 nbd_coroutine_start(s, &request);
376 ret = nbd_co_send_request(s, &request, qiov, offset);
377 if (ret < 0) {
378 reply.error = -ret;
379 } else {
380 nbd_co_receive_reply(s, &request, &reply, NULL, 0);
381 }
382 nbd_coroutine_end(s, &request);
383 return -reply.error;
384 }
385
386 /* qemu-nbd has a limit of slightly less than 1M per request. Try to
387 * remain aligned to 4K. */
388 #define NBD_MAX_SECTORS 2040
389
390 static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
391 int nb_sectors, QEMUIOVector *qiov)
392 {
393 int offset = 0;
394 int ret;
395 while (nb_sectors > NBD_MAX_SECTORS) {
396 ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
397 if (ret < 0) {
398 return ret;
399 }
400 offset += NBD_MAX_SECTORS * 512;
401 sector_num += NBD_MAX_SECTORS;
402 nb_sectors -= NBD_MAX_SECTORS;
403 }
404 return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
405 }
406
407 static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
408 int nb_sectors, QEMUIOVector *qiov)
409 {
410 int offset = 0;
411 int ret;
412 while (nb_sectors > NBD_MAX_SECTORS) {
413 ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
414 if (ret < 0) {
415 return ret;
416 }
417 offset += NBD_MAX_SECTORS * 512;
418 sector_num += NBD_MAX_SECTORS;
419 nb_sectors -= NBD_MAX_SECTORS;
420 }
421 return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
422 }
423
424 static int nbd_co_flush(BlockDriverState *bs)
425 {
426 BDRVNBDState *s = bs->opaque;
427 struct nbd_request request;
428 struct nbd_reply reply;
429 ssize_t ret;
430
431 if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
432 return 0;
433 }
434
435 request.type = NBD_CMD_FLUSH;
436 if (s->nbdflags & NBD_FLAG_SEND_FUA) {
437 request.type |= NBD_CMD_FLAG_FUA;
438 }
439
440 request.from = 0;
441 request.len = 0;
442
443 nbd_coroutine_start(s, &request);
444 ret = nbd_co_send_request(s, &request, NULL, 0);
445 if (ret < 0) {
446 reply.error = -ret;
447 } else {
448 nbd_co_receive_reply(s, &request, &reply, NULL, 0);
449 }
450 nbd_coroutine_end(s, &request);
451 return -reply.error;
452 }
453
454 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
455 int nb_sectors)
456 {
457 BDRVNBDState *s = bs->opaque;
458 struct nbd_request request;
459 struct nbd_reply reply;
460 ssize_t ret;
461
462 if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
463 return 0;
464 }
465 request.type = NBD_CMD_TRIM;
466 request.from = sector_num * 512;;
467 request.len = nb_sectors * 512;
468
469 nbd_coroutine_start(s, &request);
470 ret = nbd_co_send_request(s, &request, NULL, 0);
471 if (ret < 0) {
472 reply.error = -ret;
473 } else {
474 nbd_co_receive_reply(s, &request, &reply, NULL, 0);
475 }
476 nbd_coroutine_end(s, &request);
477 return -reply.error;
478 }
479
480 static void nbd_close(BlockDriverState *bs)
481 {
482 BDRVNBDState *s = bs->opaque;
483 g_free(s->export_name);
484 g_free(s->host_spec);
485
486 nbd_teardown_connection(bs);
487 }
488
489 static int64_t nbd_getlength(BlockDriverState *bs)
490 {
491 BDRVNBDState *s = bs->opaque;
492
493 return s->size;
494 }
495
/* Driver table for the "nbd" protocol: coroutine-based read/write/flush/
 * discard, no on-disk format (protocol driver only). */
static BlockDriver bdrv_nbd = {
    .format_name = "nbd",
    .instance_size = sizeof(BDRVNBDState),
    .bdrv_file_open = nbd_open,
    .bdrv_co_readv = nbd_co_readv,
    .bdrv_co_writev = nbd_co_writev,
    .bdrv_close = nbd_close,
    .bdrv_co_flush_to_os = nbd_co_flush,
    .bdrv_co_discard = nbd_co_discard,
    .bdrv_getlength = nbd_getlength,
    .protocol_name = "nbd",
};
508
/* Register the NBD protocol driver with the block layer at startup. */
static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
}

block_init(bdrv_nbd_init);