]> git.proxmox.com Git - mirror_qemu.git/blob - block/gluster.c
gluster: Switch .bdrv_co_discard() to byte-based
[mirror_qemu.git] / block / gluster.c
1 /*
2 * GlusterFS backend for QEMU
3 *
4 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
8 *
9 */
10 #include "qemu/osdep.h"
11 #include <glusterfs/api/glfs.h>
12 #include "block/block_int.h"
13 #include "qapi/error.h"
14 #include "qemu/uri.h"
15
16 typedef struct GlusterAIOCB {
17 int64_t size;
18 int ret;
19 QEMUBH *bh;
20 Coroutine *coroutine;
21 AioContext *aio_context;
22 } GlusterAIOCB;
23
/* Driver-private state kept in the opaque area of a gluster BlockDriverState. */
typedef struct BDRVGlusterState BDRVGlusterState;

struct BDRVGlusterState {
    struct glfs *glfs;        /* connection to the gluster volume */
    struct glfs_fd *fd;       /* handle of the opened image */
    bool supports_seek_data;  /* outcome of the SEEK_DATA probe done at open */
    int debug_level;          /* gluster log level chosen at open */
};
30
/* Connection parameters extracted from a gluster URI. */
typedef struct GlusterConf GlusterConf;

struct GlusterConf {
    char *server;     /* host, or the socket path for the unix transport */
    int port;         /* glusterd port taken from the URI */
    char *volname;    /* name of the gluster volume */
    char *image;      /* path of the image inside the volume */
    char *transport;  /* "tcp", "unix" or "rdma" */
    int debug_level;  /* log level handed to glfs_set_logging() */
};
39
40 static void qemu_gluster_gconf_free(GlusterConf *gconf)
41 {
42 if (gconf) {
43 g_free(gconf->server);
44 g_free(gconf->volname);
45 g_free(gconf->image);
46 g_free(gconf->transport);
47 g_free(gconf);
48 }
49 }
50
51 static int parse_volume_options(GlusterConf *gconf, char *path)
52 {
53 char *p, *q;
54
55 if (!path) {
56 return -EINVAL;
57 }
58
59 /* volume */
60 p = q = path + strspn(path, "/");
61 p += strcspn(p, "/");
62 if (*p == '\0') {
63 return -EINVAL;
64 }
65 gconf->volname = g_strndup(q, p - q);
66
67 /* image */
68 p += strspn(p, "/");
69 if (*p == '\0') {
70 return -EINVAL;
71 }
72 gconf->image = g_strdup(p);
73 return 0;
74 }
75
76 /*
77 * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
78 *
79 * 'gluster' is the protocol.
80 *
81 * 'transport' specifies the transport type used to connect to gluster
82 * management daemon (glusterd). Valid transport types are
83 * tcp, unix and rdma. If a transport type isn't specified, then tcp
84 * type is assumed.
85 *
86 * 'server' specifies the server where the volume file specification for
87 * the given volume resides. This can be either hostname, ipv4 address
88 * or ipv6 address. ipv6 address needs to be within square brackets [ ].
89 * If transport type is 'unix', then 'server' field should not be specified.
90 * The 'socket' field needs to be populated with the path to unix domain
91 * socket.
92 *
93 * 'port' is the port number on which glusterd is listening. This is optional
94 * and if not specified, QEMU will send 0 which will make gluster to use the
95 * default port. If the transport type is unix, then 'port' should not be
96 * specified.
97 *
98 * 'volname' is the name of the gluster volume which contains the VM image.
99 *
100 * 'image' is the path to the actual VM image that resides on gluster volume.
101 *
102 * Examples:
103 *
104 * file=gluster://1.2.3.4/testvol/a.img
105 * file=gluster+tcp://1.2.3.4/testvol/a.img
106 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
107 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
108 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
109 * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
110 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
111 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
112 */
113 static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
114 {
115 URI *uri;
116 QueryParams *qp = NULL;
117 bool is_unix = false;
118 int ret = 0;
119
120 uri = uri_parse(filename);
121 if (!uri) {
122 return -EINVAL;
123 }
124
125 /* transport */
126 if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
127 gconf->transport = g_strdup("tcp");
128 } else if (!strcmp(uri->scheme, "gluster+tcp")) {
129 gconf->transport = g_strdup("tcp");
130 } else if (!strcmp(uri->scheme, "gluster+unix")) {
131 gconf->transport = g_strdup("unix");
132 is_unix = true;
133 } else if (!strcmp(uri->scheme, "gluster+rdma")) {
134 gconf->transport = g_strdup("rdma");
135 } else {
136 ret = -EINVAL;
137 goto out;
138 }
139
140 ret = parse_volume_options(gconf, uri->path);
141 if (ret < 0) {
142 goto out;
143 }
144
145 qp = query_params_parse(uri->query);
146 if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
147 ret = -EINVAL;
148 goto out;
149 }
150
151 if (is_unix) {
152 if (uri->server || uri->port) {
153 ret = -EINVAL;
154 goto out;
155 }
156 if (strcmp(qp->p[0].name, "socket")) {
157 ret = -EINVAL;
158 goto out;
159 }
160 gconf->server = g_strdup(qp->p[0].value);
161 } else {
162 gconf->server = g_strdup(uri->server ? uri->server : "localhost");
163 gconf->port = uri->port;
164 }
165
166 out:
167 if (qp) {
168 query_params_free(qp);
169 }
170 uri_free(uri);
171 return ret;
172 }
173
174 static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
175 Error **errp)
176 {
177 struct glfs *glfs = NULL;
178 int ret;
179 int old_errno;
180
181 ret = qemu_gluster_parseuri(gconf, filename);
182 if (ret < 0) {
183 error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/"
184 "volname/image[?socket=...]");
185 errno = -ret;
186 goto out;
187 }
188
189 glfs = glfs_new(gconf->volname);
190 if (!glfs) {
191 goto out;
192 }
193
194 ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
195 gconf->port);
196 if (ret < 0) {
197 goto out;
198 }
199
200 ret = glfs_set_logging(glfs, "-", gconf->debug_level);
201 if (ret < 0) {
202 goto out;
203 }
204
205 ret = glfs_init(glfs);
206 if (ret) {
207 error_setg_errno(errp, errno,
208 "Gluster connection failed for server=%s port=%d "
209 "volume=%s image=%s transport=%s", gconf->server,
210 gconf->port, gconf->volname, gconf->image,
211 gconf->transport);
212
213 /* glfs_init sometimes doesn't set errno although docs suggest that */
214 if (errno == 0)
215 errno = EINVAL;
216
217 goto out;
218 }
219 return glfs;
220
221 out:
222 if (glfs) {
223 old_errno = errno;
224 glfs_fini(glfs);
225 errno = old_errno;
226 }
227 return NULL;
228 }
229
230 static void qemu_gluster_complete_aio(void *opaque)
231 {
232 GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
233
234 qemu_bh_delete(acb->bh);
235 acb->bh = NULL;
236 qemu_coroutine_enter(acb->coroutine);
237 }
238
239 /*
240 * AIO callback routine called from GlusterFS thread.
241 */
242 static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
243 {
244 GlusterAIOCB *acb = (GlusterAIOCB *)arg;
245
246 if (!ret || ret == acb->size) {
247 acb->ret = 0; /* Success */
248 } else if (ret < 0) {
249 acb->ret = -errno; /* Read/Write failed */
250 } else {
251 acb->ret = -EIO; /* Partial read/write - fail it */
252 }
253
254 acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
255 qemu_bh_schedule(acb->bh);
256 }
257
258 #define GLUSTER_OPT_FILENAME "filename"
259 #define GLUSTER_OPT_DEBUG "debug"
260 #define GLUSTER_DEBUG_DEFAULT 4
261 #define GLUSTER_DEBUG_MAX 9
262
263 /* TODO Convert to fine grained options */
264 static QemuOptsList runtime_opts = {
265 .name = "gluster",
266 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
267 .desc = {
268 {
269 .name = GLUSTER_OPT_FILENAME,
270 .type = QEMU_OPT_STRING,
271 .help = "URL to the gluster image",
272 },
273 {
274 .name = GLUSTER_OPT_DEBUG,
275 .type = QEMU_OPT_NUMBER,
276 .help = "Gluster log level, valid range is 0-9",
277 },
278 { /* end of list */ }
279 },
280 };
281
282 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
283 {
284 assert(open_flags != NULL);
285
286 *open_flags |= O_BINARY;
287
288 if (bdrv_flags & BDRV_O_RDWR) {
289 *open_flags |= O_RDWR;
290 } else {
291 *open_flags |= O_RDONLY;
292 }
293
294 if ((bdrv_flags & BDRV_O_NOCACHE)) {
295 *open_flags |= O_DIRECT;
296 }
297 }
298
299 /*
300 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
301 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
302 * - Corrected versions return -1 and set errno to EINVAL.
303 * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
304 * errno to ENXIO when SEEK_DATA is called with a position of EOF.
305 */
306 static bool qemu_gluster_test_seek(struct glfs_fd *fd)
307 {
308 off_t ret, eof;
309
310 eof = glfs_lseek(fd, 0, SEEK_END);
311 if (eof < 0) {
312 /* this should never occur */
313 return false;
314 }
315
316 /* this should always fail with ENXIO if SEEK_DATA is supported */
317 ret = glfs_lseek(fd, eof, SEEK_DATA);
318 return (ret < 0) && (errno == ENXIO);
319 }
320
321 static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
322 int bdrv_flags, Error **errp)
323 {
324 BDRVGlusterState *s = bs->opaque;
325 int open_flags = 0;
326 int ret = 0;
327 GlusterConf *gconf = g_new0(GlusterConf, 1);
328 QemuOpts *opts;
329 Error *local_err = NULL;
330 const char *filename;
331
332 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
333 qemu_opts_absorb_qdict(opts, options, &local_err);
334 if (local_err) {
335 error_propagate(errp, local_err);
336 ret = -EINVAL;
337 goto out;
338 }
339
340 filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);
341
342 s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
343 GLUSTER_DEBUG_DEFAULT);
344 if (s->debug_level < 0) {
345 s->debug_level = 0;
346 } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
347 s->debug_level = GLUSTER_DEBUG_MAX;
348 }
349
350 gconf->debug_level = s->debug_level;
351 s->glfs = qemu_gluster_init(gconf, filename, errp);
352 if (!s->glfs) {
353 ret = -errno;
354 goto out;
355 }
356
357 #ifdef CONFIG_GLUSTERFS_XLATOR_OPT
358 /* Without this, if fsync fails for a recoverable reason (for instance,
359 * ENOSPC), gluster will dump its cache, preventing retries. This means
360 * almost certain data loss. Not all gluster versions support the
361 * 'resync-failed-syncs-after-fsync' key value, but there is no way to
362 * discover during runtime if it is supported (this api returns success for
363 * unknown key/value pairs) */
364 ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
365 "resync-failed-syncs-after-fsync",
366 "on");
367 if (ret < 0) {
368 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
369 ret = -errno;
370 goto out;
371 }
372 #endif
373
374 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
375
376 s->fd = glfs_open(s->glfs, gconf->image, open_flags);
377 if (!s->fd) {
378 ret = -errno;
379 }
380
381 s->supports_seek_data = qemu_gluster_test_seek(s->fd);
382
383 out:
384 qemu_opts_del(opts);
385 qemu_gluster_gconf_free(gconf);
386 if (!ret) {
387 return ret;
388 }
389 if (s->fd) {
390 glfs_close(s->fd);
391 }
392 if (s->glfs) {
393 glfs_fini(s->glfs);
394 }
395 return ret;
396 }
397
/* Connection and image handle prepared by a reopen until commit or abort. */
typedef struct BDRVGlusterReopenState BDRVGlusterReopenState;

struct BDRVGlusterReopenState {
    struct glfs *glfs;   /* newly created volume connection */
    struct glfs_fd *fd;  /* image handle opened on that connection */
};
402
403
404 static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
405 BlockReopenQueue *queue, Error **errp)
406 {
407 int ret = 0;
408 BDRVGlusterState *s;
409 BDRVGlusterReopenState *reop_s;
410 GlusterConf *gconf = NULL;
411 int open_flags = 0;
412
413 assert(state != NULL);
414 assert(state->bs != NULL);
415
416 s = state->bs->opaque;
417
418 state->opaque = g_new0(BDRVGlusterReopenState, 1);
419 reop_s = state->opaque;
420
421 qemu_gluster_parse_flags(state->flags, &open_flags);
422
423 gconf = g_new0(GlusterConf, 1);
424
425 gconf->debug_level = s->debug_level;
426 reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
427 if (reop_s->glfs == NULL) {
428 ret = -errno;
429 goto exit;
430 }
431
432 #ifdef CONFIG_GLUSTERFS_XLATOR_OPT
433 ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
434 "resync-failed-syncs-after-fsync", "on");
435 if (ret < 0) {
436 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
437 ret = -errno;
438 goto exit;
439 }
440 #endif
441
442 reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
443 if (reop_s->fd == NULL) {
444 /* reops->glfs will be cleaned up in _abort */
445 ret = -errno;
446 goto exit;
447 }
448
449 exit:
450 /* state->opaque will be freed in either the _abort or _commit */
451 qemu_gluster_gconf_free(gconf);
452 return ret;
453 }
454
455 static void qemu_gluster_reopen_commit(BDRVReopenState *state)
456 {
457 BDRVGlusterReopenState *reop_s = state->opaque;
458 BDRVGlusterState *s = state->bs->opaque;
459
460
461 /* close the old */
462 if (s->fd) {
463 glfs_close(s->fd);
464 }
465 if (s->glfs) {
466 glfs_fini(s->glfs);
467 }
468
469 /* use the newly opened image / connection */
470 s->fd = reop_s->fd;
471 s->glfs = reop_s->glfs;
472
473 g_free(state->opaque);
474 state->opaque = NULL;
475
476 return;
477 }
478
479
480 static void qemu_gluster_reopen_abort(BDRVReopenState *state)
481 {
482 BDRVGlusterReopenState *reop_s = state->opaque;
483
484 if (reop_s == NULL) {
485 return;
486 }
487
488 if (reop_s->fd) {
489 glfs_close(reop_s->fd);
490 }
491
492 if (reop_s->glfs) {
493 glfs_fini(reop_s->glfs);
494 }
495
496 g_free(state->opaque);
497 state->opaque = NULL;
498
499 return;
500 }
501
#ifdef CONFIG_GLUSTERFS_ZEROFILL
/*
 * Zero @size bytes starting at @offset with an asynchronous zerofill request
 * and wait in the coroutine for its completion.
 *
 * Returns the request status, or -errno if the request could not be issued.
 */
static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
        int64_t offset, int size, BdrvRequestFlags flags)
{
    BDRVGlusterState *s = bs->opaque;
    GlusterAIOCB acb = {
        .size = size,
        .ret = 0,
        .coroutine = qemu_coroutine_self(),
        .aio_context = bdrv_get_aio_context(bs),
    };

    if (glfs_zerofill_async(s->fd, offset, size,
                            gluster_finish_aiocb, &acb) < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif

/* Tell whether this build was configured with the gluster zerofill API. */
static inline bool gluster_supports_zerofill(void)
{
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    return true;
#else
    return false;
#endif
}

/*
 * Synchronously zero @size bytes at @offset of @fd.  Without zerofill
 * support this does nothing and returns 0.
 */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
                                        int64_t size)
{
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    return glfs_zerofill(fd, offset, size);
#else
    return 0;
#endif
}
547
548 static int qemu_gluster_create(const char *filename,
549 QemuOpts *opts, Error **errp)
550 {
551 struct glfs *glfs;
552 struct glfs_fd *fd;
553 int ret = 0;
554 int prealloc = 0;
555 int64_t total_size = 0;
556 char *tmp = NULL;
557 GlusterConf *gconf = g_new0(GlusterConf, 1);
558
559 gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
560 GLUSTER_DEBUG_DEFAULT);
561 if (gconf->debug_level < 0) {
562 gconf->debug_level = 0;
563 } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
564 gconf->debug_level = GLUSTER_DEBUG_MAX;
565 }
566
567 glfs = qemu_gluster_init(gconf, filename, errp);
568 if (!glfs) {
569 ret = -errno;
570 goto out;
571 }
572
573 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
574 BDRV_SECTOR_SIZE);
575
576 tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
577 if (!tmp || !strcmp(tmp, "off")) {
578 prealloc = 0;
579 } else if (!strcmp(tmp, "full") &&
580 gluster_supports_zerofill()) {
581 prealloc = 1;
582 } else {
583 error_setg(errp, "Invalid preallocation mode: '%s'"
584 " or GlusterFS doesn't support zerofill API",
585 tmp);
586 ret = -EINVAL;
587 goto out;
588 }
589
590 fd = glfs_creat(glfs, gconf->image,
591 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
592 if (!fd) {
593 ret = -errno;
594 } else {
595 if (!glfs_ftruncate(fd, total_size)) {
596 if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
597 ret = -errno;
598 }
599 } else {
600 ret = -errno;
601 }
602
603 if (glfs_close(fd) != 0) {
604 ret = -errno;
605 }
606 }
607 out:
608 g_free(tmp);
609 qemu_gluster_gconf_free(gconf);
610 if (glfs) {
611 glfs_fini(glfs);
612 }
613 return ret;
614 }
615
616 static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
617 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
618 {
619 int ret;
620 GlusterAIOCB acb;
621 BDRVGlusterState *s = bs->opaque;
622 size_t size = nb_sectors * BDRV_SECTOR_SIZE;
623 off_t offset = sector_num * BDRV_SECTOR_SIZE;
624
625 acb.size = size;
626 acb.ret = 0;
627 acb.coroutine = qemu_coroutine_self();
628 acb.aio_context = bdrv_get_aio_context(bs);
629
630 if (write) {
631 ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
632 gluster_finish_aiocb, &acb);
633 } else {
634 ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
635 gluster_finish_aiocb, &acb);
636 }
637
638 if (ret < 0) {
639 return -errno;
640 }
641
642 qemu_coroutine_yield();
643 return acb.ret;
644 }
645
646 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
647 {
648 int ret;
649 BDRVGlusterState *s = bs->opaque;
650
651 ret = glfs_ftruncate(s->fd, offset);
652 if (ret < 0) {
653 return -errno;
654 }
655
656 return 0;
657 }
658
659 static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
660 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
661 {
662 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
663 }
664
665 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
666 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
667 {
668 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
669 }
670
671 static void qemu_gluster_close(BlockDriverState *bs)
672 {
673 BDRVGlusterState *s = bs->opaque;
674
675 if (s->fd) {
676 glfs_close(s->fd);
677 s->fd = NULL;
678 }
679 glfs_fini(s->glfs);
680 }
681
682 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
683 {
684 int ret;
685 GlusterAIOCB acb;
686 BDRVGlusterState *s = bs->opaque;
687
688 acb.size = 0;
689 acb.ret = 0;
690 acb.coroutine = qemu_coroutine_self();
691 acb.aio_context = bdrv_get_aio_context(bs);
692
693 ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
694 if (ret < 0) {
695 ret = -errno;
696 goto error;
697 }
698
699 qemu_coroutine_yield();
700 if (acb.ret < 0) {
701 ret = acb.ret;
702 goto error;
703 }
704
705 return acb.ret;
706
707 error:
708 /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
709 * after a fsync failure, so we have no way of allowing the guest to safely
710 * continue. Gluster versions prior to 3.5.6 don't retain the cache
711 * either, but will invalidate the fd on error, so this is again our only
712 * option.
713 *
714 * The 'resync-failed-syncs-after-fsync' xlator option for the
715 * write-behind cache will cause later gluster versions to retain its
716 * cache after error, so long as the fd remains open. However, we
717 * currently have no way of knowing if this option is supported.
718 *
719 * TODO: Once gluster provides a way for us to determine if the option
720 * is supported, bypass the closure and setting drv to NULL. */
721 qemu_gluster_close(bs);
722 bs->drv = NULL;
723 return ret;
724 }
725
#ifdef CONFIG_GLUSTERFS_DISCARD
/*
 * Discard @size bytes starting at @offset with an asynchronous request and
 * wait in the coroutine for its completion.
 *
 * Returns the request status, or -errno if the request could not be issued.
 */
static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
                                                 int64_t offset, int size)
{
    BDRVGlusterState *s = bs->opaque;
    GlusterAIOCB acb = {
        .size = 0,
        .ret = 0,
        .coroutine = qemu_coroutine_self(),
        .aio_context = bdrv_get_aio_context(bs),
    };

    if (glfs_discard_async(s->fd, offset, size,
                           gluster_finish_aiocb, &acb) < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif
748
749 static int64_t qemu_gluster_getlength(BlockDriverState *bs)
750 {
751 BDRVGlusterState *s = bs->opaque;
752 int64_t ret;
753
754 ret = glfs_lseek(s->fd, 0, SEEK_END);
755 if (ret < 0) {
756 return -errno;
757 } else {
758 return ret;
759 }
760 }
761
762 static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
763 {
764 BDRVGlusterState *s = bs->opaque;
765 struct stat st;
766 int ret;
767
768 ret = glfs_fstat(s->fd, &st);
769 if (ret < 0) {
770 return -errno;
771 } else {
772 return st.st_blocks * 512;
773 }
774 }
775
776 static int qemu_gluster_has_zero_init(BlockDriverState *bs)
777 {
778 /* GlusterFS volume could be backed by a block device */
779 return 0;
780 }
781
782 /*
783 * Find allocation range in @bs around offset @start.
784 * May change underlying file descriptor's file offset.
785 * If @start is not in a hole, store @start in @data, and the
786 * beginning of the next hole in @hole, and return 0.
787 * If @start is in a non-trailing hole, store @start in @hole and the
788 * beginning of the next non-hole in @data, and return 0.
789 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
790 * If we can't find out, return a negative errno other than -ENXIO.
791 *
792 * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
793 */
794 static int find_allocation(BlockDriverState *bs, off_t start,
795 off_t *data, off_t *hole)
796 {
797 BDRVGlusterState *s = bs->opaque;
798 off_t offs;
799
800 if (!s->supports_seek_data) {
801 return -ENOTSUP;
802 }
803
804 /*
805 * SEEK_DATA cases:
806 * D1. offs == start: start is in data
807 * D2. offs > start: start is in a hole, next data at offs
808 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
809 * or start is beyond EOF
810 * If the latter happens, the file has been truncated behind
811 * our back since we opened it. All bets are off then.
812 * Treating like a trailing hole is simplest.
813 * D4. offs < 0, errno != ENXIO: we learned nothing
814 */
815 offs = glfs_lseek(s->fd, start, SEEK_DATA);
816 if (offs < 0) {
817 return -errno; /* D3 or D4 */
818 }
819 assert(offs >= start);
820
821 if (offs > start) {
822 /* D2: in hole, next data at offs */
823 *hole = start;
824 *data = offs;
825 return 0;
826 }
827
828 /* D1: in data, end not yet known */
829
830 /*
831 * SEEK_HOLE cases:
832 * H1. offs == start: start is in a hole
833 * If this happens here, a hole has been dug behind our back
834 * since the previous lseek().
835 * H2. offs > start: either start is in data, next hole at offs,
836 * or start is in trailing hole, EOF at offs
837 * Linux treats trailing holes like any other hole: offs ==
838 * start. Solaris seeks to EOF instead: offs > start (blech).
839 * If that happens here, a hole has been dug behind our back
840 * since the previous lseek().
841 * H3. offs < 0, errno = ENXIO: start is beyond EOF
842 * If this happens, the file has been truncated behind our
843 * back since we opened it. Treat it like a trailing hole.
844 * H4. offs < 0, errno != ENXIO: we learned nothing
845 * Pretend we know nothing at all, i.e. "forget" about D1.
846 */
847 offs = glfs_lseek(s->fd, start, SEEK_HOLE);
848 if (offs < 0) {
849 return -errno; /* D1 and (H3 or H4) */
850 }
851 assert(offs >= start);
852
853 if (offs > start) {
854 /*
855 * D1 and H2: either in data, next hole at offs, or it was in
856 * data but is now in a trailing hole. In the latter case,
857 * all bets are off. Treating it as if it there was data all
858 * the way to EOF is safe, so simply do that.
859 */
860 *data = start;
861 *hole = offs;
862 return 0;
863 }
864
865 /* D1 and H1 */
866 return -EBUSY;
867 }
868
869 /*
870 * Returns the allocation status of the specified sectors.
871 *
872 * If 'sector_num' is beyond the end of the disk image the return value is 0
873 * and 'pnum' is set to 0.
874 *
875 * 'pnum' is set to the number of sectors (including and immediately following
876 * the specified sector) that are known to be in the same
877 * allocated/unallocated state.
878 *
879 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
880 * beyond the end of the disk image it will be clamped.
881 *
882 * (Based on raw_co_get_block_status() from raw-posix.c.)
883 */
884 static int64_t coroutine_fn qemu_gluster_co_get_block_status(
885 BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
886 BlockDriverState **file)
887 {
888 BDRVGlusterState *s = bs->opaque;
889 off_t start, data = 0, hole = 0;
890 int64_t total_size;
891 int ret = -EINVAL;
892
893 if (!s->fd) {
894 return ret;
895 }
896
897 start = sector_num * BDRV_SECTOR_SIZE;
898 total_size = bdrv_getlength(bs);
899 if (total_size < 0) {
900 return total_size;
901 } else if (start >= total_size) {
902 *pnum = 0;
903 return 0;
904 } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
905 nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
906 }
907
908 ret = find_allocation(bs, start, &data, &hole);
909 if (ret == -ENXIO) {
910 /* Trailing hole */
911 *pnum = nb_sectors;
912 ret = BDRV_BLOCK_ZERO;
913 } else if (ret < 0) {
914 /* No info available, so pretend there are no holes */
915 *pnum = nb_sectors;
916 ret = BDRV_BLOCK_DATA;
917 } else if (data == start) {
918 /* On a data extent, compute sectors to the end of the extent,
919 * possibly including a partial sector at EOF. */
920 *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
921 ret = BDRV_BLOCK_DATA;
922 } else {
923 /* On a hole, compute sectors to the beginning of the next extent. */
924 assert(hole == start);
925 *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
926 ret = BDRV_BLOCK_ZERO;
927 }
928
929 *file = bs;
930
931 return ret | BDRV_BLOCK_OFFSET_VALID | start;
932 }
933
934
935 static QemuOptsList qemu_gluster_create_opts = {
936 .name = "qemu-gluster-create-opts",
937 .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
938 .desc = {
939 {
940 .name = BLOCK_OPT_SIZE,
941 .type = QEMU_OPT_SIZE,
942 .help = "Virtual disk size"
943 },
944 {
945 .name = BLOCK_OPT_PREALLOC,
946 .type = QEMU_OPT_STRING,
947 .help = "Preallocation mode (allowed values: off, full)"
948 },
949 {
950 .name = GLUSTER_OPT_DEBUG,
951 .type = QEMU_OPT_NUMBER,
952 .help = "Gluster log level, valid range is 0-9",
953 },
954 { /* end of list */ }
955 }
956 };
957
958 static BlockDriver bdrv_gluster = {
959 .format_name = "gluster",
960 .protocol_name = "gluster",
961 .instance_size = sizeof(BDRVGlusterState),
962 .bdrv_needs_filename = true,
963 .bdrv_file_open = qemu_gluster_open,
964 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
965 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
966 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
967 .bdrv_close = qemu_gluster_close,
968 .bdrv_create = qemu_gluster_create,
969 .bdrv_getlength = qemu_gluster_getlength,
970 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
971 .bdrv_truncate = qemu_gluster_truncate,
972 .bdrv_co_readv = qemu_gluster_co_readv,
973 .bdrv_co_writev = qemu_gluster_co_writev,
974 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
975 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
976 #ifdef CONFIG_GLUSTERFS_DISCARD
977 .bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
978 #endif
979 #ifdef CONFIG_GLUSTERFS_ZEROFILL
980 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
981 #endif
982 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
983 .create_opts = &qemu_gluster_create_opts,
984 };
985
986 static BlockDriver bdrv_gluster_tcp = {
987 .format_name = "gluster",
988 .protocol_name = "gluster+tcp",
989 .instance_size = sizeof(BDRVGlusterState),
990 .bdrv_needs_filename = true,
991 .bdrv_file_open = qemu_gluster_open,
992 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
993 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
994 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
995 .bdrv_close = qemu_gluster_close,
996 .bdrv_create = qemu_gluster_create,
997 .bdrv_getlength = qemu_gluster_getlength,
998 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
999 .bdrv_truncate = qemu_gluster_truncate,
1000 .bdrv_co_readv = qemu_gluster_co_readv,
1001 .bdrv_co_writev = qemu_gluster_co_writev,
1002 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1003 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1004 #ifdef CONFIG_GLUSTERFS_DISCARD
1005 .bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
1006 #endif
1007 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1008 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1009 #endif
1010 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1011 .create_opts = &qemu_gluster_create_opts,
1012 };
1013
1014 static BlockDriver bdrv_gluster_unix = {
1015 .format_name = "gluster",
1016 .protocol_name = "gluster+unix",
1017 .instance_size = sizeof(BDRVGlusterState),
1018 .bdrv_needs_filename = true,
1019 .bdrv_file_open = qemu_gluster_open,
1020 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
1021 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
1022 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
1023 .bdrv_close = qemu_gluster_close,
1024 .bdrv_create = qemu_gluster_create,
1025 .bdrv_getlength = qemu_gluster_getlength,
1026 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
1027 .bdrv_truncate = qemu_gluster_truncate,
1028 .bdrv_co_readv = qemu_gluster_co_readv,
1029 .bdrv_co_writev = qemu_gluster_co_writev,
1030 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1031 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1032 #ifdef CONFIG_GLUSTERFS_DISCARD
1033 .bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
1034 #endif
1035 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1036 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1037 #endif
1038 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1039 .create_opts = &qemu_gluster_create_opts,
1040 };
1041
1042 static BlockDriver bdrv_gluster_rdma = {
1043 .format_name = "gluster",
1044 .protocol_name = "gluster+rdma",
1045 .instance_size = sizeof(BDRVGlusterState),
1046 .bdrv_needs_filename = true,
1047 .bdrv_file_open = qemu_gluster_open,
1048 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
1049 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
1050 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
1051 .bdrv_close = qemu_gluster_close,
1052 .bdrv_create = qemu_gluster_create,
1053 .bdrv_getlength = qemu_gluster_getlength,
1054 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
1055 .bdrv_truncate = qemu_gluster_truncate,
1056 .bdrv_co_readv = qemu_gluster_co_readv,
1057 .bdrv_co_writev = qemu_gluster_co_writev,
1058 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1059 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1060 #ifdef CONFIG_GLUSTERFS_DISCARD
1061 .bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
1062 #endif
1063 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1064 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1065 #endif
1066 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1067 .create_opts = &qemu_gluster_create_opts,
1068 };
1069
1070 static void bdrv_gluster_init(void)
1071 {
1072 bdrv_register(&bdrv_gluster_rdma);
1073 bdrv_register(&bdrv_gluster_unix);
1074 bdrv_register(&bdrv_gluster_tcp);
1075 bdrv_register(&bdrv_gluster);
1076 }
1077
1078 block_init(bdrv_gluster_init);