]> git.proxmox.com Git - mirror_qemu.git/blame - block/gluster.c
block/gluster: using new qapi schema
[mirror_qemu.git] / block / gluster.c
CommitLineData
8d6d89cb
BR
1/*
2 * GlusterFS backend for QEMU
3 *
4 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
5 *
85c09bc0
BR
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
8d6d89cb 8 *
8d6d89cb 9 */
80c71a24 10#include "qemu/osdep.h"
8d6d89cb 11#include <glusterfs/api/glfs.h>
737e150e 12#include "block/block_int.h"
da34e65c 13#include "qapi/error.h"
1de7afc9 14#include "qemu/uri.h"
0552ff24 15#include "qemu/error-report.h"
8d6d89cb 16
f70c50c8
PKK
17#define GLUSTER_OPT_FILENAME "filename"
18#define GLUSTER_OPT_DEBUG "debug"
7edac2dd 19#define GLUSTER_DEFAULT_PORT 24007
f70c50c8
PKK
20#define GLUSTER_DEBUG_DEFAULT 4
21#define GLUSTER_DEBUG_MAX 9
22
23
/*
 * Per-request completion state for the coroutine-based async I/O path.
 * Lives on the issuing coroutine's stack; completed from the gluster
 * callback thread via a bottom half scheduled in @aio_context.
 */
typedef struct GlusterAIOCB {
    int64_t size;            /* expected byte count of the request */
    int ret;                 /* 0 on success, negative errno on failure */
    QEMUBH *bh;              /* bottom half that re-enters the coroutine */
    Coroutine *coroutine;    /* coroutine waiting for this request */
    AioContext *aio_context; /* context in which the bh is scheduled */
} GlusterAIOCB;
31
/* Per-BlockDriverState driver state. */
typedef struct BDRVGlusterState {
    struct glfs *glfs;       /* gluster connection handle */
    struct glfs_fd *fd;      /* open fd for the image file on the volume */
    bool supports_seek_data; /* SEEK_DATA/SEEK_HOLE usable (probed at open) */
    int debug_level;         /* gluster log level, clamped to 0..GLUSTER_DEBUG_MAX */
} BDRVGlusterState;
38
f70c50c8
PKK
/* Temporary connection/fd pair built in reopen_prepare, adopted in
 * reopen_commit or torn down in reopen_abort. */
typedef struct BDRVGlusterReopenState {
    struct glfs *glfs;  /* new connection, not yet owned by the BDS */
    struct glfs_fd *fd; /* new fd opened with the reopen flags */
} BDRVGlusterReopenState;
43
f70c50c8
PKK
44
/* Options accepted by qemu_gluster_create() (qemu-img create). */
static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    }
};
67
/* Options accepted at open time (qemu_gluster_open). */
static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = GLUSTER_OPT_FILENAME,
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    },
};
85
86
7edac2dd 87static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
8d6d89cb
BR
88{
89 char *p, *q;
90
91 if (!path) {
92 return -EINVAL;
93 }
94
95 /* volume */
96 p = q = path + strspn(path, "/");
97 p += strcspn(p, "/");
98 if (*p == '\0') {
99 return -EINVAL;
100 }
d5cf4079 101 gconf->volume = g_strndup(q, p - q);
8d6d89cb 102
d5cf4079 103 /* path */
8d6d89cb
BR
104 p += strspn(p, "/");
105 if (*p == '\0') {
106 return -EINVAL;
107 }
d5cf4079 108 gconf->path = g_strdup(p);
8d6d89cb
BR
109 return 0;
110}
111
112/*
d5cf4079 113 * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...]
8d6d89cb
BR
114 *
115 * 'gluster' is the protocol.
116 *
117 * 'transport' specifies the transport type used to connect to gluster
118 * management daemon (glusterd). Valid transport types are
0552ff24 119 * tcp or unix. If a transport type isn't specified, then tcp type is assumed.
8d6d89cb 120 *
d5cf4079 121 * 'host' specifies the host where the volume file specification for
0552ff24 122 * the given volume resides. This can be either hostname or ipv4 address.
d5cf4079 123 * If transport type is 'unix', then 'host' field should not be specified.
8d6d89cb
BR
124 * The 'socket' field needs to be populated with the path to unix domain
125 * socket.
126 *
 * 'port' is the port number on which glusterd is listening. This is optional
 * and if not specified, QEMU will send 0, which makes gluster use the
 * default port. If the transport type is unix, then 'port' should not be
 * specified.
131 *
d5cf4079 132 * 'volume' is the name of the gluster volume which contains the VM image.
8d6d89cb 133 *
d5cf4079 134 * 'path' is the path to the actual VM image that resides on gluster volume.
8d6d89cb
BR
135 *
136 * Examples:
137 *
138 * file=gluster://1.2.3.4/testvol/a.img
139 * file=gluster+tcp://1.2.3.4/testvol/a.img
140 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
d5cf4079 141 * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img
8d6d89cb 142 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
8d6d89cb 143 */
7edac2dd
PKK
/*
 * Parse a gluster[+transport]:// URI (see the comment above) into @gconf.
 * Allocates gconf->server and fills in transport, host/port or socket
 * path, volume and image path.
 *
 * Returns 0 on success, -EINVAL on any malformed URI. On failure the
 * partially-filled @gconf is left for the caller to free via
 * qapi_free_BlockdevOptionsGluster().
 */
static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
                                  const char *filename)
{
    GlusterServer *gsconf;
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    gconf->server = gsconf = g_new0(GlusterServer, 1);

    /* transport */
    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
        gsconf->type = GLUSTER_TRANSPORT_TCP;
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
        gsconf->type = GLUSTER_TRANSPORT_TCP;
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
        gsconf->type = GLUSTER_TRANSPORT_UNIX;
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
        /* rdma volfile fetch was never actually supported; degrade to tcp */
        gsconf->type = GLUSTER_TRANSPORT_TCP;
        error_report("Warning: rdma feature is not supported, falling "
                     "back to tcp");
    } else {
        ret = -EINVAL;
        goto out;
    }

    ret = parse_volume_options(gconf, uri->path);
    if (ret < 0) {
        goto out;
    }

    /* the only accepted query parameter is ?socket=..., and only for unix */
    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        /* unix transport: no host/port allowed, socket path is mandatory */
        if (uri->server || uri->port) {
            ret = -EINVAL;
            goto out;
        }
        if (strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        gsconf->u.q_unix.path = g_strdup(qp->p[0].value);
    } else {
        /* tcp: default host "localhost", default port GLUSTER_DEFAULT_PORT */
        gsconf->u.tcp.host = g_strdup(uri->server ? uri->server : "localhost");
        if (uri->port) {
            gsconf->u.tcp.port = g_strdup_printf("%d", uri->port);
        } else {
            gsconf->u.tcp.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT);
        }
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}
214
7edac2dd
PKK
/*
 * Parse @filename and establish a gluster connection described by it.
 *
 * Returns a connected glfs handle on success. On failure returns NULL
 * with @errp set and errno set to a meaningful value (callers use
 * -errno as their return code, so errno is maintained deliberately on
 * every failure path).
 */
static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
                                      const char *filename, Error **errp)
{
    struct glfs *glfs = NULL;
    int ret;
    int old_errno;

    ret = qemu_gluster_parse_uri(gconf, filename);
    if (ret < 0) {
        error_setg(errp, "Invalid URI");
        error_append_hint(errp, "Usage: file=gluster[+transport]://"
                                "[host[:port]]/volume/path[?socket=...]\n");
        errno = -ret;
        goto out;
    }

    glfs = glfs_new(gconf->volume);
    if (!glfs) {
        /* glfs_new sets errno on failure -- TODO confirm against gfapi docs */
        goto out;
    }

    if (gconf->server->type == GLUSTER_TRANSPORT_UNIX) {
        ret = glfs_set_volfile_server(glfs,
                                      GlusterTransport_lookup[gconf->server->type],
                                      gconf->server->u.q_unix.path, 0);
    } else {
        ret = glfs_set_volfile_server(glfs,
                                      GlusterTransport_lookup[gconf->server->type],
                                      gconf->server->u.tcp.host,
                                      atoi(gconf->server->u.tcp.port));
    }
    if (ret < 0) {
        goto out;
    }

    /* "-" directs gluster logging to stderr */
    ret = glfs_set_logging(glfs, "-", gconf->debug_level);
    if (ret < 0) {
        goto out;
    }

    ret = glfs_init(glfs);
    if (ret) {
        if (gconf->server->type == GLUSTER_TRANSPORT_UNIX) {
            error_setg(errp,
                       "Gluster connection for volume %s, path %s failed on "
                       "socket %s ", gconf->volume, gconf->path,
                       gconf->server->u.q_unix.path);
        } else {
            error_setg(errp,
                       "Gluster connection for volume %s, path %s failed on "
                       "host %s and port %s ", gconf->volume, gconf->path,
                       gconf->server->u.tcp.host, gconf->server->u.tcp.port);
        }

        /* glfs_init sometimes doesn't set errno although docs suggest that */
        if (errno == 0) {
            errno = EINVAL;
        }

        goto out;
    }
    return glfs;

out:
    if (glfs) {
        /* glfs_fini may clobber errno; preserve it for the caller */
        old_errno = errno;
        glfs_fini(glfs);
        errno = old_errno;
    }
    return NULL;
}
286
15744b0b 287static void qemu_gluster_complete_aio(void *opaque)
8d6d89cb 288{
15744b0b 289 GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
8d6d89cb 290
15744b0b
BR
291 qemu_bh_delete(acb->bh);
292 acb->bh = NULL;
0b8b8753 293 qemu_coroutine_enter(acb->coroutine);
8d6d89cb
BR
294}
295
7c815372
BR
/*
 * AIO callback routine called from GlusterFS thread.
 * Translates the gfapi result into 0/-errno/-EIO and bounces completion
 * back into the request's AioContext via a bottom half, since the
 * coroutine must not be entered from a foreign thread.
 */
static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)arg;

    if (!ret || ret == acb->size) {
        acb->ret = 0; /* Success */
    } else if (ret < 0) {
        acb->ret = -errno; /* Read/Write failed */
    } else {
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
    qemu_bh_schedule(acb->bh);
}
314
1b37b344
JC
315static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
316{
317 assert(open_flags != NULL);
318
319 *open_flags |= O_BINARY;
320
321 if (bdrv_flags & BDRV_O_RDWR) {
322 *open_flags |= O_RDWR;
323 } else {
324 *open_flags |= O_RDONLY;
325 }
326
327 if ((bdrv_flags & BDRV_O_NOCACHE)) {
328 *open_flags |= O_DIRECT;
329 }
330}
331
947eb203
NV
332/*
333 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
334 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
335 * - Corrected versions return -1 and set errno to EINVAL.
336 * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
337 * errno to ENXIO when SEEK_DATA is called with a position of EOF.
338 */
339static bool qemu_gluster_test_seek(struct glfs_fd *fd)
340{
341 off_t ret, eof;
342
343 eof = glfs_lseek(fd, 0, SEEK_END);
344 if (eof < 0) {
345 /* this should never occur */
346 return false;
347 }
348
349 /* this should always fail with ENXIO if SEEK_DATA is supported */
350 ret = glfs_lseek(fd, eof, SEEK_DATA);
351 return (ret < 0) && (errno == ENXIO);
352}
353
56d1b4d2 354static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
015a1036 355 int bdrv_flags, Error **errp)
8d6d89cb
BR
356{
357 BDRVGlusterState *s = bs->opaque;
1b37b344 358 int open_flags = 0;
8d6d89cb 359 int ret = 0;
7edac2dd 360 BlockdevOptionsGluster *gconf = NULL;
b4894776
KW
361 QemuOpts *opts;
362 Error *local_err = NULL;
363 const char *filename;
364
87ea75d5 365 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
b4894776 366 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 367 if (local_err) {
a7451cb8 368 error_propagate(errp, local_err);
b4894776
KW
369 ret = -EINVAL;
370 goto out;
371 }
372
7eac868a
JC
373 filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);
374
375 s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
376 GLUSTER_DEBUG_DEFAULT);
377 if (s->debug_level < 0) {
378 s->debug_level = 0;
379 } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
380 s->debug_level = GLUSTER_DEBUG_MAX;
381 }
b4894776 382
7edac2dd 383 gconf = g_new0(BlockdevOptionsGluster, 1);
7eac868a 384 gconf->debug_level = s->debug_level;
7edac2dd 385 gconf->has_debug_level = true;
a7451cb8 386 s->glfs = qemu_gluster_init(gconf, filename, errp);
8d6d89cb
BR
387 if (!s->glfs) {
388 ret = -errno;
389 goto out;
390 }
391
d85fa9eb
JC
392#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
393 /* Without this, if fsync fails for a recoverable reason (for instance,
394 * ENOSPC), gluster will dump its cache, preventing retries. This means
395 * almost certain data loss. Not all gluster versions support the
396 * 'resync-failed-syncs-after-fsync' key value, but there is no way to
397 * discover during runtime if it is supported (this api returns success for
398 * unknown key/value pairs) */
399 ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
400 "resync-failed-syncs-after-fsync",
401 "on");
402 if (ret < 0) {
403 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
404 ret = -errno;
405 goto out;
406 }
407#endif
408
1b37b344 409 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
8d6d89cb 410
d5cf4079 411 s->fd = glfs_open(s->glfs, gconf->path, open_flags);
8d6d89cb
BR
412 if (!s->fd) {
413 ret = -errno;
8d6d89cb 414 }
8d6d89cb 415
947eb203
NV
416 s->supports_seek_data = qemu_gluster_test_seek(s->fd);
417
8d6d89cb 418out:
b4894776 419 qemu_opts_del(opts);
7edac2dd 420 qapi_free_BlockdevOptionsGluster(gconf);
8d6d89cb
BR
421 if (!ret) {
422 return ret;
423 }
424 if (s->fd) {
425 glfs_close(s->fd);
426 }
427 if (s->glfs) {
428 glfs_fini(s->glfs);
429 }
430 return ret;
431}
432
adccfbcd
JC
/*
 * Prepare a reopen: build a second, independent connection and fd with
 * the new flags in state->opaque. Nothing owned by the BDS is touched
 * here; _commit adopts the new pair and _abort discards it.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                       BlockReopenQueue *queue, Error **errp)
{
    int ret = 0;
    BDRVGlusterState *s;
    BDRVGlusterReopenState *reop_s;
    BlockdevOptionsGluster *gconf;
    int open_flags = 0;

    assert(state != NULL);
    assert(state->bs != NULL);

    s = state->bs->opaque;

    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

    /* carry the current debug level over to the new connection */
    gconf = g_new0(BlockdevOptionsGluster, 1);
    gconf->debug_level = s->debug_level;
    gconf->has_debug_level = true;
    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
        ret = -errno;
        goto exit;
    }

#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
    /* same cache-retention workaround as in qemu_gluster_open() */
    ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
                                 "resync-failed-syncs-after-fsync", "on");
    if (ret < 0) {
        error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
        ret = -errno;
        goto exit;
    }
#endif

    reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags);
    if (reop_s->fd == NULL) {
        /* reops->glfs will be cleaned up in _abort */
        ret = -errno;
        goto exit;
    }

exit:
    /* state->opaque will be freed in either the _abort or _commit */
    qapi_free_BlockdevOptionsGluster(gconf);
    return ret;
}
483
484static void qemu_gluster_reopen_commit(BDRVReopenState *state)
485{
486 BDRVGlusterReopenState *reop_s = state->opaque;
487 BDRVGlusterState *s = state->bs->opaque;
488
489
490 /* close the old */
491 if (s->fd) {
492 glfs_close(s->fd);
493 }
494 if (s->glfs) {
495 glfs_fini(s->glfs);
496 }
497
498 /* use the newly opened image / connection */
499 s->fd = reop_s->fd;
500 s->glfs = reop_s->glfs;
501
502 g_free(state->opaque);
503 state->opaque = NULL;
504
505 return;
506}
507
508
509static void qemu_gluster_reopen_abort(BDRVReopenState *state)
510{
511 BDRVGlusterReopenState *reop_s = state->opaque;
512
513 if (reop_s == NULL) {
514 return;
515 }
516
517 if (reop_s->fd) {
518 glfs_close(reop_s->fd);
519 }
520
521 if (reop_s->glfs) {
522 glfs_fini(reop_s->glfs);
523 }
524
525 g_free(state->opaque);
526 state->opaque = NULL;
527
528 return;
529}
530
7c815372 531#ifdef CONFIG_GLUSTERFS_ZEROFILL
e88a36eb 532static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
f70c50c8
PKK
533 int64_t offset,
534 int size,
535 BdrvRequestFlags flags)
7c815372
BR
536{
537 int ret;
c833d1e8 538 GlusterAIOCB acb;
7c815372 539 BDRVGlusterState *s = bs->opaque;
7c815372 540
c833d1e8
PB
541 acb.size = size;
542 acb.ret = 0;
543 acb.coroutine = qemu_coroutine_self();
544 acb.aio_context = bdrv_get_aio_context(bs);
7c815372 545
c833d1e8 546 ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
7c815372 547 if (ret < 0) {
c833d1e8 548 return -errno;
7c815372
BR
549 }
550
551 qemu_coroutine_yield();
c833d1e8 552 return acb.ret;
7c815372 553}
cf7f616b
BR
554
555static inline bool gluster_supports_zerofill(void)
556{
557 return 1;
558}
559
560static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
f70c50c8 561 int64_t size)
cf7f616b
BR
562{
563 return glfs_zerofill(fd, offset, size);
564}
565
566#else
567static inline bool gluster_supports_zerofill(void)
568{
569 return 0;
570}
571
572static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
f70c50c8 573 int64_t size)
cf7f616b
BR
574{
575 return 0;
576}
7c815372
BR
577#endif
578
/*
 * Create a new image on a gluster volume (qemu-img create).
 * Connects with qemu_gluster_init(), creates and truncates the file to
 * the rounded-up size, and optionally zero-fills it when
 * preallocation=full was requested and gfapi supports zerofill.
 *
 * Returns 0 on success, negative errno / -EINVAL on failure.
 */
static int qemu_gluster_create(const char *filename,
                               QemuOpts *opts, Error **errp)
{
    BlockdevOptionsGluster *gconf;
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;

    gconf = g_new0(BlockdevOptionsGluster, 1);
    /* clamp the log level into the valid range */
    gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                                 GLUSTER_DEBUG_DEFAULT);
    if (gconf->debug_level < 0) {
        gconf->debug_level = 0;
    } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
        gconf->debug_level = GLUSTER_DEBUG_MAX;
    }
    gconf->has_debug_level = true;

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
        ret = -errno;
        goto out;
    }

    /* image size is rounded up to a whole number of sectors */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
        prealloc = 0;
    } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) {
        prealloc = 1;
    } else {
        error_setg(errp, "Invalid preallocation mode: '%s'"
                   " or GlusterFS doesn't support zerofill API", tmp);
        ret = -EINVAL;
        goto out;
    }

    fd = glfs_creat(glfs, gconf->path,
                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
    if (!fd) {
        ret = -errno;
    } else {
        /* glfs_ftruncate returns 0 on success */
        if (!glfs_ftruncate(fd, total_size)) {
            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
            ret = -errno;
        }

        /* close may surface deferred write errors; keep the last errno */
        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
    }
out:
    g_free(tmp);
    qapi_free_BlockdevOptionsGluster(gconf);
    if (glfs) {
        glfs_fini(glfs);
    }
    return ret;
}
646
/*
 * Common coroutine read/write path. Issues an async preadv/pwritev and
 * yields; gluster_finish_aiocb() wakes us via a bottom half once gfapi
 * completes. @acb lives on this coroutine's stack, which is safe
 * because we do not return before the completion re-enters us.
 *
 * Returns 0 on success, negative errno on failure.
 */
static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
                                           int64_t sector_num, int nb_sectors,
                                           QEMUIOVector *qiov, int write)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                 gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
677
42ec24e2
PB
678static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
679{
680 int ret;
681 BDRVGlusterState *s = bs->opaque;
682
683 ret = glfs_ftruncate(s->fd, offset);
684 if (ret < 0) {
685 return -errno;
686 }
687
688 return 0;
689}
690
15744b0b 691static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
f70c50c8
PKK
692 int64_t sector_num,
693 int nb_sectors,
694 QEMUIOVector *qiov)
8d6d89cb 695{
15744b0b 696 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
8d6d89cb
BR
697}
698
15744b0b 699static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
f70c50c8
PKK
700 int64_t sector_num,
701 int nb_sectors,
702 QEMUIOVector *qiov)
8d6d89cb 703{
15744b0b 704 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
8d6d89cb
BR
705}
706
5d4343e6
JC
707static void qemu_gluster_close(BlockDriverState *bs)
708{
709 BDRVGlusterState *s = bs->opaque;
710
711 if (s->fd) {
712 glfs_close(s->fd);
713 s->fd = NULL;
714 }
715 glfs_fini(s->glfs);
716}
717
/*
 * Flush via glfs_fsync_async(). On ANY failure the whole BDS is
 * invalidated (fd closed, bs->drv = NULL) -- see the comment on the
 * error path for why a failed fsync cannot be retried safely.
 *
 * Returns 0 on success, negative errno on failure.
 */
static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        ret = -errno;
        goto error;
    }

    qemu_coroutine_yield();
    if (acb.ret < 0) {
        ret = acb.ret;
        goto error;
    }

    return acb.ret;

error:
    /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
     * after a fsync failure, so we have no way of allowing the guest to safely
     * continue. Gluster versions prior to 3.5.6 don't retain the cache
     * either, but will invalidate the fd on error, so this is again our only
     * option.
     *
     * The 'resync-failed-syncs-after-fsync' xlator option for the
     * write-behind cache will cause later gluster versions to retain its
     * cache after error, so long as the fd remains open. However, we
     * currently have no way of knowing if this option is supported.
     *
     * TODO: Once gluster provides a way for us to determine if the option
     * is supported, bypass the closure and setting drv to NULL. */
    qemu_gluster_close(bs);
    bs->drv = NULL;
    return ret;
}
761
0c14fb47 762#ifdef CONFIG_GLUSTERFS_DISCARD
15744b0b 763static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
f70c50c8
PKK
764 int64_t sector_num,
765 int nb_sectors)
0c14fb47
BR
766{
767 int ret;
c833d1e8 768 GlusterAIOCB acb;
0c14fb47 769 BDRVGlusterState *s = bs->opaque;
15744b0b
BR
770 size_t size = nb_sectors * BDRV_SECTOR_SIZE;
771 off_t offset = sector_num * BDRV_SECTOR_SIZE;
0c14fb47 772
c833d1e8
PB
773 acb.size = 0;
774 acb.ret = 0;
775 acb.coroutine = qemu_coroutine_self();
776 acb.aio_context = bdrv_get_aio_context(bs);
0c14fb47 777
c833d1e8 778 ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
0c14fb47 779 if (ret < 0) {
c833d1e8 780 return -errno;
0c14fb47 781 }
15744b0b
BR
782
783 qemu_coroutine_yield();
c833d1e8 784 return acb.ret;
0c14fb47
BR
785}
786#endif
787
8d6d89cb
BR
788static int64_t qemu_gluster_getlength(BlockDriverState *bs)
789{
790 BDRVGlusterState *s = bs->opaque;
791 int64_t ret;
792
793 ret = glfs_lseek(s->fd, 0, SEEK_END);
794 if (ret < 0) {
795 return -errno;
796 } else {
797 return ret;
798 }
799}
800
801static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
802{
803 BDRVGlusterState *s = bs->opaque;
804 struct stat st;
805 int ret;
806
807 ret = glfs_fstat(s->fd, &st);
808 if (ret < 0) {
809 return -errno;
810 } else {
811 return st.st_blocks * 512;
812 }
813}
814
8ab6feec
KW
/* New images cannot be assumed zero-initialized. */
static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
    /* GlusterFS volume could be backed by a block device */
    return 0;
}
820
947eb203
NV
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 *
 * Returns -ENOTSUP immediately when the open-time probe found
 * SEEK_DATA/SEEK_HOLE to be broken on this connection.
 *
 * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
    BDRVGlusterState *s = bs->opaque;
    off_t offs;

    if (!s->supports_seek_data) {
        return -ENOTSUP;
    }

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = glfs_lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }
    assert(offs >= start);

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *     or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = glfs_lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    assert(offs >= start);

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if it there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1 */
    return -EBUSY;
}
907
/*
 * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * (Based on raw_co_get_block_status() from raw-posix.c.)
 */
static int64_t coroutine_fn qemu_gluster_co_get_block_status(
        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file)
{
    BDRVGlusterState *s = bs->opaque;
    off_t start, data = 0, hole = 0;
    int64_t total_size;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

    start = sector_num * BDRV_SECTOR_SIZE;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        return total_size;
    } else if (start >= total_size) {
        *pnum = 0;
        return 0;
    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
        /* clamp the query to the image size */
        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

    ret = find_allocation(bs, start, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent,
         * possibly including a partial sector at EOF. */
        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent.  */
        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
        ret = BDRV_BLOCK_ZERO;
    }

    *file = bs;

    return ret | BDRV_BLOCK_OFFSET_VALID | start;
}
972
973
8d6d89cb
BR
/* Driver for bare "gluster://" URIs (tcp transport by default). */
static BlockDriver bdrv_gluster = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1001
/* Driver for explicit "gluster+tcp://" URIs; same callbacks as above. */
static BlockDriver bdrv_gluster_tcp = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+tcp",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1029
/* Driver for "gluster+unix://" URIs (unix-domain-socket transport). */
static BlockDriver bdrv_gluster_unix = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+unix",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1057
0552ff24
PKK
/* rdma is deprecated (actually never supported for volfile fetch).
 * Let's maintain it for the protocol compatibility, to make sure things
 * won't break immediately. For now, gluster+rdma will fall back to gluster+tcp
 * protocol with a warning.
 * TODO: remove gluster+rdma interface support
 */
static BlockDriver bdrv_gluster_rdma = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+rdma",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1091
/* Register one BlockDriver per supported URI scheme.
 * Registration order is preserved from the original code. */
static void bdrv_gluster_init(void)
{
    bdrv_register(&bdrv_gluster_rdma);
    bdrv_register(&bdrv_gluster_unix);
    bdrv_register(&bdrv_gluster_tcp);
    bdrv_register(&bdrv_gluster);
}

block_init(bdrv_gluster_init);