]> git.proxmox.com Git - mirror_qemu.git/blame - block/gluster.c
block/gluster: deprecate rdma support
[mirror_qemu.git] / block / gluster.c
CommitLineData
8d6d89cb
BR
1/*
2 * GlusterFS backend for QEMU
3 *
4 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
5 *
85c09bc0
BR
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
8d6d89cb 8 *
8d6d89cb 9 */
80c71a24 10#include "qemu/osdep.h"
8d6d89cb 11#include <glusterfs/api/glfs.h>
737e150e 12#include "block/block_int.h"
da34e65c 13#include "qapi/error.h"
1de7afc9 14#include "qemu/uri.h"
0552ff24 15#include "qemu/error-report.h"
8d6d89cb 16
f70c50c8
PKK
17#define GLUSTER_OPT_FILENAME "filename"
18#define GLUSTER_OPT_DEBUG "debug"
19#define GLUSTER_DEBUG_DEFAULT 4
20#define GLUSTER_DEBUG_MAX 9
21
22
/* Per-request completion state shared between the gfapi callback thread
 * and the coroutine that issued the request. */
typedef struct GlusterAIOCB {
    int64_t size;            /* expected byte count of the transfer */
    int ret;                 /* 0 on success, negative errno on failure */
    QEMUBH *bh;              /* bottom half used to re-enter the coroutine */
    Coroutine *coroutine;    /* coroutine parked in qemu_coroutine_yield() */
    AioContext *aio_context; /* context the completion BH is scheduled in */
} GlusterAIOCB;
30
/* Per-BlockDriverState driver data (bs->opaque). */
typedef struct BDRVGlusterState {
    struct glfs *glfs;       /* gfapi connection handle */
    struct glfs_fd *fd;      /* open handle of the image file */
    bool supports_seek_data; /* result of the SEEK_DATA probe at open time */
    int debug_level;         /* gluster log level, clamped to 0..GLUSTER_DEBUG_MAX */
} BDRVGlusterState;
37
f70c50c8
PKK
/* Second connection/fd pair built by reopen_prepare; adopted by
 * reopen_commit or torn down by reopen_abort. */
typedef struct BDRVGlusterReopenState {
    struct glfs *glfs;
    struct glfs_fd *fd;
} BDRVGlusterReopenState;
42
/* Parsed form of a gluster[+transport]://... image URI. */
typedef struct GlusterConf {
    char *host;      /* server name/IP; for "unix" transport, the socket path */
    int port;        /* 0 lets glusterd use its default port */
    char *volume;    /* gluster volume name */
    char *path;      /* image path within the volume */
    char *transport; /* "tcp" or "unix" */
    int debug_level; /* log level handed to glfs_set_logging() */
} GlusterConf;
51
f70c50c8
PKK
52
/* Options accepted when creating a gluster image (bdrv_create). */
static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    }
};
75
/* Options absorbed from the -drive qdict in qemu_gluster_open(). */
static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = GLUSTER_OPT_FILENAME,
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    },
};
93
94
8d6d89cb
BR
95static void qemu_gluster_gconf_free(GlusterConf *gconf)
96{
1b37b344 97 if (gconf) {
d5cf4079
PKK
98 g_free(gconf->host);
99 g_free(gconf->volume);
100 g_free(gconf->path);
1b37b344
JC
101 g_free(gconf->transport);
102 g_free(gconf);
103 }
8d6d89cb
BR
104}
105
106static int parse_volume_options(GlusterConf *gconf, char *path)
107{
108 char *p, *q;
109
110 if (!path) {
111 return -EINVAL;
112 }
113
114 /* volume */
115 p = q = path + strspn(path, "/");
116 p += strcspn(p, "/");
117 if (*p == '\0') {
118 return -EINVAL;
119 }
d5cf4079 120 gconf->volume = g_strndup(q, p - q);
8d6d89cb 121
d5cf4079 122 /* path */
8d6d89cb
BR
123 p += strspn(p, "/");
124 if (*p == '\0') {
125 return -EINVAL;
126 }
d5cf4079 127 gconf->path = g_strdup(p);
8d6d89cb
BR
128 return 0;
129}
130
/*
 * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...]
 *
 * 'gluster' is the protocol.
 *
 * 'transport' specifies the transport type used to connect to gluster
 * management daemon (glusterd). Valid transport types are
 * tcp or unix. If a transport type isn't specified, then tcp type is assumed.
 *
 * 'host' specifies the host where the volume file specification for
 * the given volume resides. This can be either hostname or ipv4 address.
 * If transport type is 'unix', then 'host' field should not be specified.
 * The 'socket' field needs to be populated with the path to unix domain
 * socket.
 *
 * 'port' is the port number on which glusterd is listening. This is optional
 * and if not specified, QEMU will send 0 which will make gluster to use the
 * default port. If the transport type is unix, then 'port' should not be
 * specified.
 *
 * 'volume' is the name of the gluster volume which contains the VM image.
 *
 * 'path' is the path to the actual VM image that resides on gluster volume.
 *
 * Examples:
 *
 * file=gluster://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
 * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img
 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
 *
 * Fills @gconf from @filename. Returns 0 on success, -EINVAL for any
 * malformed or unsupported URI.
 */
static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
{
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport: a bare "gluster" scheme defaults to tcp; gluster+rdma is
     * accepted for compatibility but degrades to tcp with a warning. */
    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
        gconf->transport = g_strdup("unix");
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
        gconf->transport = g_strdup("tcp");
        error_report("Warning: rdma feature is not supported, falling "
                     "back to tcp");
    } else {
        ret = -EINVAL;
        goto out;
    }

    ret = parse_volume_options(gconf, uri->path);
    if (ret < 0) {
        goto out;
    }

    /* Exactly one query parameter ("socket") is allowed, and only for the
     * unix transport; tcp URIs must carry none. */
    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        if (uri->server || uri->port) {
            ret = -EINVAL;
            goto out;
        }
        if (strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        /* For unix transport the socket path travels in the 'host' field. */
        gconf->host = g_strdup(qp->p[0].value);
    } else {
        gconf->host = g_strdup(uri->server ? uri->server : "localhost");
        gconf->port = uri->port;
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}
225
a7451cb8
PB
/* Parse @filename and bring up a gfapi connection to the volume.
 * Returns an initialized glfs handle, or NULL with errno set (callers
 * read errno to derive their return code). */
static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
                                      Error **errp)
{
    struct glfs *glfs = NULL;
    int ret;
    int old_errno;

    ret = qemu_gluster_parseuri(gconf, filename);
    if (ret < 0) {
        error_setg(errp, "Usage: file=gluster[+transport]://[host[:port]]/"
                   "volume/path[?socket=...]");
        errno = -ret;
        goto out;
    }

    glfs = glfs_new(gconf->volume);
    if (!glfs) {
        goto out;
    }

    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->host,
                                  gconf->port);
    if (ret < 0) {
        goto out;
    }

    /* "-" directs gluster's log output to stderr. */
    ret = glfs_set_logging(glfs, "-", gconf->debug_level);
    if (ret < 0) {
        goto out;
    }

    ret = glfs_init(glfs);
    if (ret) {
        error_setg_errno(errp, errno,
                         "Gluster connection failed for host=%s port=%d "
                         "volume=%s path=%s transport=%s", gconf->host,
                         gconf->port, gconf->volume, gconf->path,
                         gconf->transport);

        /* glfs_init sometimes doesn't set errno although docs suggest that */
        if (errno == 0)
            errno = EINVAL;

        goto out;
    }
    return glfs;

out:
    if (glfs) {
        /* glfs_fini() may clobber errno; preserve the original cause. */
        old_errno = errno;
        glfs_fini(glfs);
        errno = old_errno;
    }
    return NULL;
}
281
15744b0b 282static void qemu_gluster_complete_aio(void *opaque)
8d6d89cb 283{
15744b0b 284 GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
8d6d89cb 285
15744b0b
BR
286 qemu_bh_delete(acb->bh);
287 acb->bh = NULL;
0b8b8753 288 qemu_coroutine_enter(acb->coroutine);
8d6d89cb
BR
289}
290
7c815372
BR
/*
 * AIO callback routine called from GlusterFS thread.
 * Maps the gfapi result into acb->ret, then schedules a BH in the
 * request's AioContext so the coroutine is resumed from the right thread.
 */
static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)arg;

    if (!ret || ret == acb->size) {
        acb->ret = 0; /* Success */
    } else if (ret < 0) {
        /* NOTE(review): assumes errno is meaningful in this gfapi callback
         * thread — TODO confirm against the gfapi async contract. */
        acb->ret = -errno; /* Read/Write failed */
    } else {
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
    qemu_bh_schedule(acb->bh);
}
309
1b37b344
JC
310static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
311{
312 assert(open_flags != NULL);
313
314 *open_flags |= O_BINARY;
315
316 if (bdrv_flags & BDRV_O_RDWR) {
317 *open_flags |= O_RDWR;
318 } else {
319 *open_flags |= O_RDONLY;
320 }
321
322 if ((bdrv_flags & BDRV_O_NOCACHE)) {
323 *open_flags |= O_DIRECT;
324 }
325}
326
947eb203
NV
/*
 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
 * - Corrected versions return -1 and set errno to EINVAL.
 * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
 *   errno to ENXIO when SEEK_DATA is called with a position of EOF.
 *
 * Returns true only when the fd demonstrates correct SEEK_DATA behavior.
 */
static bool qemu_gluster_test_seek(struct glfs_fd *fd)
{
    off_t ret, eof;

    eof = glfs_lseek(fd, 0, SEEK_END);
    if (eof < 0) {
        /* this should never occur */
        return false;
    }

    /* this should always fail with ENXIO if SEEK_DATA is supported */
    ret = glfs_lseek(fd, eof, SEEK_DATA);
    return (ret < 0) && (errno == ENXIO);
}
348
56d1b4d2 349static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
015a1036 350 int bdrv_flags, Error **errp)
8d6d89cb
BR
351{
352 BDRVGlusterState *s = bs->opaque;
1b37b344 353 int open_flags = 0;
8d6d89cb 354 int ret = 0;
5839e53b 355 GlusterConf *gconf = g_new0(GlusterConf, 1);
b4894776
KW
356 QemuOpts *opts;
357 Error *local_err = NULL;
358 const char *filename;
359
87ea75d5 360 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
b4894776 361 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 362 if (local_err) {
a7451cb8 363 error_propagate(errp, local_err);
b4894776
KW
364 ret = -EINVAL;
365 goto out;
366 }
367
7eac868a
JC
368 filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);
369
370 s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
371 GLUSTER_DEBUG_DEFAULT);
372 if (s->debug_level < 0) {
373 s->debug_level = 0;
374 } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
375 s->debug_level = GLUSTER_DEBUG_MAX;
376 }
b4894776 377
7eac868a 378 gconf->debug_level = s->debug_level;
a7451cb8 379 s->glfs = qemu_gluster_init(gconf, filename, errp);
8d6d89cb
BR
380 if (!s->glfs) {
381 ret = -errno;
382 goto out;
383 }
384
d85fa9eb
JC
385#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
386 /* Without this, if fsync fails for a recoverable reason (for instance,
387 * ENOSPC), gluster will dump its cache, preventing retries. This means
388 * almost certain data loss. Not all gluster versions support the
389 * 'resync-failed-syncs-after-fsync' key value, but there is no way to
390 * discover during runtime if it is supported (this api returns success for
391 * unknown key/value pairs) */
392 ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
393 "resync-failed-syncs-after-fsync",
394 "on");
395 if (ret < 0) {
396 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
397 ret = -errno;
398 goto out;
399 }
400#endif
401
1b37b344 402 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
8d6d89cb 403
d5cf4079 404 s->fd = glfs_open(s->glfs, gconf->path, open_flags);
8d6d89cb
BR
405 if (!s->fd) {
406 ret = -errno;
8d6d89cb 407 }
8d6d89cb 408
947eb203
NV
409 s->supports_seek_data = qemu_gluster_test_seek(s->fd);
410
8d6d89cb 411out:
b4894776 412 qemu_opts_del(opts);
8d6d89cb
BR
413 qemu_gluster_gconf_free(gconf);
414 if (!ret) {
415 return ret;
416 }
417 if (s->fd) {
418 glfs_close(s->fd);
419 }
420 if (s->glfs) {
421 glfs_fini(s->glfs);
422 }
423 return ret;
424}
425
adccfbcd
JC
/* Build a second, independent gluster connection and image fd for the
 * reopen transaction. The pair is stashed in state->opaque and either
 * adopted by _commit or destroyed by _abort. Returns 0 or -errno. */
static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                       BlockReopenQueue *queue, Error **errp)
{
    int ret = 0;
    BDRVGlusterState *s;
    BDRVGlusterReopenState *reop_s;
    GlusterConf *gconf = NULL;
    int open_flags = 0;

    assert(state != NULL);
    assert(state->bs != NULL);

    s = state->bs->opaque;

    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

    gconf = g_new0(GlusterConf, 1);

    /* Carry the existing connection's debug level over to the new one. */
    gconf->debug_level = s->debug_level;
    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
        ret = -errno;
        goto exit;
    }

#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
    /* Same cache-retention workaround as in qemu_gluster_open(). */
    ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
                                 "resync-failed-syncs-after-fsync", "on");
    if (ret < 0) {
        error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
        ret = -errno;
        goto exit;
    }
#endif

    reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags);
    if (reop_s->fd == NULL) {
        /* reops->glfs will be cleaned up in _abort */
        ret = -errno;
        goto exit;
    }

exit:
    /* state->opaque will be freed in either the _abort or _commit */
    qemu_gluster_gconf_free(gconf);
    return ret;
}
476
477static void qemu_gluster_reopen_commit(BDRVReopenState *state)
478{
479 BDRVGlusterReopenState *reop_s = state->opaque;
480 BDRVGlusterState *s = state->bs->opaque;
481
482
483 /* close the old */
484 if (s->fd) {
485 glfs_close(s->fd);
486 }
487 if (s->glfs) {
488 glfs_fini(s->glfs);
489 }
490
491 /* use the newly opened image / connection */
492 s->fd = reop_s->fd;
493 s->glfs = reop_s->glfs;
494
495 g_free(state->opaque);
496 state->opaque = NULL;
497
498 return;
499}
500
501
502static void qemu_gluster_reopen_abort(BDRVReopenState *state)
503{
504 BDRVGlusterReopenState *reop_s = state->opaque;
505
506 if (reop_s == NULL) {
507 return;
508 }
509
510 if (reop_s->fd) {
511 glfs_close(reop_s->fd);
512 }
513
514 if (reop_s->glfs) {
515 glfs_fini(reop_s->glfs);
516 }
517
518 g_free(state->opaque);
519 state->opaque = NULL;
520
521 return;
522}
523
#ifdef CONFIG_GLUSTERFS_ZEROFILL
/* Write zeroes via glfs_zerofill_async(); completion is delivered through
 * gluster_finish_aiocb like the regular read/write path. */
static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
                                                      int64_t offset,
                                                      int size,
                                                      BdrvRequestFlags flags)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}

static inline bool gluster_supports_zerofill(void)
{
    return 1;
}

/* Synchronous zerofill; used by image creation for full preallocation. */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
                                        int64_t size)
{
    return glfs_zerofill(fd, offset, size);
}

#else
/* Stubs for gfapi builds without zerofill support. */
static inline bool gluster_supports_zerofill(void)
{
    return 0;
}

static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
                                        int64_t size)
{
    return 0;
}
#endif
571
/* Create a new image on a gluster volume (bdrv_create callback).
 * Honors the size, preallocation (off/full) and debug-level options.
 * Returns 0 or negative errno. */
static int qemu_gluster_create(const char *filename,
                               QemuOpts *opts, Error **errp)
{
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;
    GlusterConf *gconf = g_new0(GlusterConf, 1);

    /* Clamp the requested log level into the valid range. */
    gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                                 GLUSTER_DEBUG_DEFAULT);
    if (gconf->debug_level < 0) {
        gconf->debug_level = 0;
    } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
        gconf->debug_level = GLUSTER_DEBUG_MAX;
    }

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
        ret = -errno;
        goto out;
    }

    /* Image size is rounded up to a whole number of sectors. */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
        prealloc = 0;
    } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) {
        prealloc = 1;
    } else {
        error_setg(errp, "Invalid preallocation mode: '%s'"
                   " or GlusterFS doesn't support zerofill API", tmp);
        ret = -EINVAL;
        goto out;
    }

    fd = glfs_creat(glfs, gconf->path,
                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
    if (!fd) {
        ret = -errno;
    } else {
        /* glfs_ftruncate() returns 0 on success. */
        if (!glfs_ftruncate(fd, total_size)) {
            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
            ret = -errno;
        }

        /* A failed close also fails the create. */
        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
    }
out:
    g_free(tmp);
    qemu_gluster_gconf_free(gconf);
    if (glfs) {
        glfs_fini(glfs);
    }
    return ret;
}
637
/* Common coroutine read/write path: issue the async gfapi request and
 * yield until gluster_finish_aiocb resumes us with the result in acb.ret.
 * @write selects glfs_pwritev_async vs glfs_preadv_async. */
static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
                                           int64_t sector_num, int nb_sectors,
                                           QEMUIOVector *qiov, int write)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    /* acb lives on this coroutine's stack; it remains valid across the
     * yield because we only resume after the callback has fired. */
    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                 gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
668
42ec24e2
PB
669static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
670{
671 int ret;
672 BDRVGlusterState *s = bs->opaque;
673
674 ret = glfs_ftruncate(s->fd, offset);
675 if (ret < 0) {
676 return -errno;
677 }
678
679 return 0;
680}
681
15744b0b 682static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
f70c50c8
PKK
683 int64_t sector_num,
684 int nb_sectors,
685 QEMUIOVector *qiov)
8d6d89cb 686{
15744b0b 687 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
8d6d89cb
BR
688}
689
15744b0b 690static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
f70c50c8
PKK
691 int64_t sector_num,
692 int nb_sectors,
693 QEMUIOVector *qiov)
8d6d89cb 694{
15744b0b 695 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
8d6d89cb
BR
696}
697
5d4343e6
JC
698static void qemu_gluster_close(BlockDriverState *bs)
699{
700 BDRVGlusterState *s = bs->opaque;
701
702 if (s->fd) {
703 glfs_close(s->fd);
704 s->fd = NULL;
705 }
706 glfs_fini(s->glfs);
707}
708
/* Flush via glfs_fsync_async(). On any failure the whole fd/connection is
 * torn down and bs->drv zapped -- see the rationale at the error label. */
static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        ret = -errno;
        goto error;
    }

    qemu_coroutine_yield();
    if (acb.ret < 0) {
        ret = acb.ret;
        goto error;
    }

    return acb.ret;

error:
    /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
     * after a fsync failure, so we have no way of allowing the guest to safely
     * continue.  Gluster versions prior to 3.5.6 don't retain the cache
     * either, but will invalidate the fd on error, so this is again our only
     * option.
     *
     * The 'resync-failed-syncs-after-fsync' xlator option for the
     * write-behind cache will cause later gluster versions to retain its
     * cache after error, so long as the fd remains open.  However, we
     * currently have no way of knowing if this option is supported.
     *
     * TODO: Once gluster provides a way for us to determine if the option
     * is supported, bypass the closure and setting drv to NULL. */
    qemu_gluster_close(bs);
    bs->drv = NULL;
    return ret;
}
752
#ifdef CONFIG_GLUSTERFS_DISCARD
/* Discard (unmap) a sector range via glfs_discard_async(). */
static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
                                                int64_t sector_num,
                                                int nb_sectors)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    /* acb.size stays 0: the callback treats ret == 0 as success and a
     * discard carries no data payload. */
    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif
778
8d6d89cb
BR
779static int64_t qemu_gluster_getlength(BlockDriverState *bs)
780{
781 BDRVGlusterState *s = bs->opaque;
782 int64_t ret;
783
784 ret = glfs_lseek(s->fd, 0, SEEK_END);
785 if (ret < 0) {
786 return -errno;
787 } else {
788 return ret;
789 }
790}
791
792static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
793{
794 BDRVGlusterState *s = bs->opaque;
795 struct stat st;
796 int ret;
797
798 ret = glfs_fstat(s->fd, &st);
799 if (ret < 0) {
800 return -errno;
801 } else {
802 return st.st_blocks * 512;
803 }
804}
805
8ab6feec
KW
/* Newly created images cannot be assumed to read back as zeroes. */
static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
    /* GlusterFS volume could be backed by a block device */
    return 0;
}
811
947eb203
NV
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 *
 * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
    BDRVGlusterState *s = bs->opaque;
    off_t offs;

    /* Bail out early if the open-time probe showed SEEK_DATA is broken. */
    if (!s->supports_seek_data) {
        return -ENOTSUP;
    }

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = glfs_lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }
    assert(offs >= start);

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *     or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = glfs_lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    assert(offs >= start);

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if it there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1 */
    return -EBUSY;
}
898
/*
 * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * (Based on raw_co_get_block_status() from raw-posix.c.)
 */
static int64_t coroutine_fn qemu_gluster_co_get_block_status(
        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file)
{
    BDRVGlusterState *s = bs->opaque;
    off_t start, data = 0, hole = 0;
    int64_t total_size;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

    /* Clamp the queried range to the image size. */
    start = sector_num * BDRV_SECTOR_SIZE;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        return total_size;
    } else if (start >= total_size) {
        *pnum = 0;
        return 0;
    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

    ret = find_allocation(bs, start, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent,
         * possibly including a partial sector at EOF. */
        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent. */
        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
        ret = BDRV_BLOCK_ZERO;
    }

    *file = bs;

    return ret | BDRV_BLOCK_OFFSET_VALID | start;
}
963
964
8d6d89cb
BR
/* Driver for the plain "gluster://" protocol spelling (tcp transport). */
static BlockDriver bdrv_gluster = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
992
/* Same driver, registered for the explicit "gluster+tcp://" spelling. */
static BlockDriver bdrv_gluster_tcp = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+tcp",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1020
/* Same driver, registered for the "gluster+unix://" (unix socket) spelling. */
static BlockDriver bdrv_gluster_unix = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+unix",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1048
0552ff24
PKK
/* rdma is deprecated (actually never supported for volfile fetch).
 * Let's maintain it for the protocol compatibility, to make sure things
 * won't break immediately. For now, gluster+rdma will fall back to gluster+tcp
 * protocol with a warning.
 * TODO: remove gluster+rdma interface support
 */
static BlockDriver bdrv_gluster_rdma = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+rdma",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1082
/* Register one BlockDriver per supported protocol spelling.
 * NOTE(review): registration order is kept as-is — lookups by the shared
 * format_name "gluster" may depend on it; confirm before reordering. */
static void bdrv_gluster_init(void)
{
    bdrv_register(&bdrv_gluster_rdma);
    bdrv_register(&bdrv_gluster_unix);
    bdrv_register(&bdrv_gluster_tcp);
    bdrv_register(&bdrv_gluster);
}

block_init(bdrv_gluster_init);