/*
 * GlusterFS backend for QEMU
 *
 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
80c71a24 10#include "qemu/osdep.h"
8d6d89cb 11#include <glusterfs/api/glfs.h>
737e150e 12#include "block/block_int.h"
da34e65c 13#include "qapi/error.h"
1de7afc9 14#include "qemu/uri.h"
8d6d89cb
BR
15
/*
 * Per-request state shared between the coroutine that issued an I/O
 * request and the gfapi completion callback (which runs on a gluster
 * thread and re-enters the coroutine via a bottom half).
 */
typedef struct GlusterAIOCB {
    int64_t size;            /* expected transfer size, used to detect short I/O */
    int ret;                 /* request result: 0 or negative errno */
    QEMUBH *bh;              /* bottom half that resumes the coroutine */
    Coroutine *coroutine;    /* coroutine waiting on this request */
    AioContext *aio_context; /* context the bottom half is scheduled in */
} GlusterAIOCB;
23
/* Per-BlockDriverState driver state for a gluster-backed image */
typedef struct BDRVGlusterState {
    struct glfs *glfs;       /* gfapi connection handle */
    struct glfs_fd *fd;      /* open handle for the image file */
    bool supports_seek_data; /* SEEK_DATA/SEEK_HOLE probe result (see qemu_gluster_test_seek) */
    int debug_level;         /* gluster log level, clamped to 0-9 */
} BDRVGlusterState;
30
/* Connection parameters parsed out of a gluster[+transport]:// URI */
typedef struct GlusterConf {
    char *host;      /* server host; for "unix" transport this holds the socket path */
    int port;        /* glusterd port; 0 makes gluster use its default */
    char *volume;    /* gluster volume name */
    char *path;      /* image path within the volume */
    char *transport; /* "tcp", "unix" or "rdma" */
    int debug_level; /* gluster log level, 0-9 */
} GlusterConf;
39
40static void qemu_gluster_gconf_free(GlusterConf *gconf)
41{
1b37b344 42 if (gconf) {
d5cf4079
PKK
43 g_free(gconf->host);
44 g_free(gconf->volume);
45 g_free(gconf->path);
1b37b344
JC
46 g_free(gconf->transport);
47 g_free(gconf);
48 }
8d6d89cb
BR
49}
50
51static int parse_volume_options(GlusterConf *gconf, char *path)
52{
53 char *p, *q;
54
55 if (!path) {
56 return -EINVAL;
57 }
58
59 /* volume */
60 p = q = path + strspn(path, "/");
61 p += strcspn(p, "/");
62 if (*p == '\0') {
63 return -EINVAL;
64 }
d5cf4079 65 gconf->volume = g_strndup(q, p - q);
8d6d89cb 66
d5cf4079 67 /* path */
8d6d89cb
BR
68 p += strspn(p, "/");
69 if (*p == '\0') {
70 return -EINVAL;
71 }
d5cf4079 72 gconf->path = g_strdup(p);
8d6d89cb
BR
73 return 0;
74}
75
/*
 * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...]
 *
 * 'gluster' is the protocol.
 *
 * 'transport' specifies the transport type used to connect to gluster
 * management daemon (glusterd). Valid transport types are
 * tcp, unix and rdma. If a transport type isn't specified, then tcp
 * type is assumed.
 *
 * 'host' specifies the host where the volume file specification for
 * the given volume resides. This can be either hostname, ipv4 address
 * or ipv6 address. ipv6 address needs to be within square brackets [ ].
 * If transport type is 'unix', then 'host' field should not be specified.
 * The 'socket' field needs to be populated with the path to unix domain
 * socket.
 *
 * 'port' is the port number on which glusterd is listening. This is optional
 * and if not specified, QEMU will send 0 which will make gluster to use the
 * default port. If the transport type is unix, then 'port' should not be
 * specified.
 *
 * 'volume' is the name of the gluster volume which contains the VM image.
 *
 * 'path' is the path to the actual VM image that resides on gluster volume.
 *
 * Examples:
 *
 * file=gluster://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
 * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img
 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
 */
/*
 * Parse a gluster[+transport]:// URI (format documented above) into @gconf.
 * Returns 0 on success, -EINVAL on any malformed input.
 */
static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
{
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport: a missing scheme defaults to tcp */
    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
        gconf->transport = g_strdup("unix");
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
        gconf->transport = g_strdup("rdma");
    } else {
        ret = -EINVAL;
        goto out;
    }

    ret = parse_volume_options(gconf, uri->path);
    if (ret < 0) {
        goto out;
    }

    /* exactly one query param ("socket") for unix transport, none otherwise */
    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        /* unix transport: socket path goes into 'host'; no server/port allowed */
        if (uri->server || uri->port) {
            ret = -EINVAL;
            goto out;
        }
        if (strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        gconf->host = g_strdup(qp->p[0].value);
    } else {
        gconf->host = g_strdup(uri->server ? uri->server : "localhost");
        gconf->port = uri->port;
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}
173
a7451cb8
PB
174static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
175 Error **errp)
8d6d89cb
BR
176{
177 struct glfs *glfs = NULL;
178 int ret;
179 int old_errno;
180
181 ret = qemu_gluster_parseuri(gconf, filename);
182 if (ret < 0) {
d5cf4079
PKK
183 error_setg(errp, "Usage: file=gluster[+transport]://[host[:port]]/"
184 "volume/path[?socket=...]");
8d6d89cb
BR
185 errno = -ret;
186 goto out;
187 }
188
d5cf4079 189 glfs = glfs_new(gconf->volume);
8d6d89cb
BR
190 if (!glfs) {
191 goto out;
192 }
193
d5cf4079 194 ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->host,
8d6d89cb
BR
195 gconf->port);
196 if (ret < 0) {
197 goto out;
198 }
199
7eac868a 200 ret = glfs_set_logging(glfs, "-", gconf->debug_level);
8d6d89cb
BR
201 if (ret < 0) {
202 goto out;
203 }
204
205 ret = glfs_init(glfs);
206 if (ret) {
a7451cb8 207 error_setg_errno(errp, errno,
d5cf4079
PKK
208 "Gluster connection failed for host=%s port=%d "
209 "volume=%s path=%s transport=%s", gconf->host,
210 gconf->port, gconf->volume, gconf->path,
a7451cb8 211 gconf->transport);
4557117d
PK
212
213 /* glfs_init sometimes doesn't set errno although docs suggest that */
214 if (errno == 0)
215 errno = EINVAL;
216
8d6d89cb
BR
217 goto out;
218 }
219 return glfs;
220
221out:
222 if (glfs) {
223 old_errno = errno;
224 glfs_fini(glfs);
225 errno = old_errno;
226 }
227 return NULL;
228}
229
/*
 * Bottom half: runs in the request's AioContext and resumes the
 * coroutine that is waiting in qemu_coroutine_yield().
 */
static void qemu_gluster_complete_aio(void *opaque)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_coroutine_enter(acb->coroutine);
}
238
7c815372
BR
/*
 * AIO callback routine called from GlusterFS thread.
 *
 * Translates the gfapi result into 0 / -errno / -EIO, then schedules a
 * bottom half so the coroutine is resumed in its own AioContext instead
 * of on the gluster thread.
 */
static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)arg;

    if (!ret || ret == acb->size) {
        acb->ret = 0; /* Success */
    } else if (ret < 0) {
        /* NOTE(review): assumes gfapi leaves a meaningful errno on this
         * thread when the callback reports failure — confirm with gfapi docs */
        acb->ret = -errno; /* Read/Write failed */
    } else {
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
    qemu_bh_schedule(acb->bh);
}
257
7eac868a
JC
#define GLUSTER_OPT_FILENAME "filename"
#define GLUSTER_OPT_DEBUG "debug"
#define GLUSTER_DEBUG_DEFAULT 4     /* gluster's default log level */
#define GLUSTER_DEBUG_MAX 9         /* highest log level gluster accepts */

/* TODO Convert to fine grained options */
/* Runtime options accepted when opening a gluster image */
static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = GLUSTER_OPT_FILENAME,
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    },
};
281
1b37b344
JC
282static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
283{
284 assert(open_flags != NULL);
285
286 *open_flags |= O_BINARY;
287
288 if (bdrv_flags & BDRV_O_RDWR) {
289 *open_flags |= O_RDWR;
290 } else {
291 *open_flags |= O_RDONLY;
292 }
293
294 if ((bdrv_flags & BDRV_O_NOCACHE)) {
295 *open_flags |= O_DIRECT;
296 }
297}
298
947eb203
NV
/*
 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
 * - Corrected versions return -1 and set errno to EINVAL.
 * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
 *   errno to ENXIO when SEEK_DATA is called with a position of EOF.
 */
static bool qemu_gluster_test_seek(struct glfs_fd *fd)
{
    off_t ret, eof;

    eof = glfs_lseek(fd, 0, SEEK_END);
    if (eof < 0) {
        /* this should never occur */
        return false;
    }

    /* this should always fail with ENXIO if SEEK_DATA is supported */
    ret = glfs_lseek(fd, eof, SEEK_DATA);
    return (ret < 0) && (errno == ENXIO);
}
320
56d1b4d2 321static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
015a1036 322 int bdrv_flags, Error **errp)
8d6d89cb
BR
323{
324 BDRVGlusterState *s = bs->opaque;
1b37b344 325 int open_flags = 0;
8d6d89cb 326 int ret = 0;
5839e53b 327 GlusterConf *gconf = g_new0(GlusterConf, 1);
b4894776
KW
328 QemuOpts *opts;
329 Error *local_err = NULL;
330 const char *filename;
331
87ea75d5 332 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
b4894776 333 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 334 if (local_err) {
a7451cb8 335 error_propagate(errp, local_err);
b4894776
KW
336 ret = -EINVAL;
337 goto out;
338 }
339
7eac868a
JC
340 filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);
341
342 s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
343 GLUSTER_DEBUG_DEFAULT);
344 if (s->debug_level < 0) {
345 s->debug_level = 0;
346 } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
347 s->debug_level = GLUSTER_DEBUG_MAX;
348 }
b4894776 349
7eac868a 350 gconf->debug_level = s->debug_level;
a7451cb8 351 s->glfs = qemu_gluster_init(gconf, filename, errp);
8d6d89cb
BR
352 if (!s->glfs) {
353 ret = -errno;
354 goto out;
355 }
356
d85fa9eb
JC
357#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
358 /* Without this, if fsync fails for a recoverable reason (for instance,
359 * ENOSPC), gluster will dump its cache, preventing retries. This means
360 * almost certain data loss. Not all gluster versions support the
361 * 'resync-failed-syncs-after-fsync' key value, but there is no way to
362 * discover during runtime if it is supported (this api returns success for
363 * unknown key/value pairs) */
364 ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
365 "resync-failed-syncs-after-fsync",
366 "on");
367 if (ret < 0) {
368 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
369 ret = -errno;
370 goto out;
371 }
372#endif
373
1b37b344 374 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
8d6d89cb 375
d5cf4079 376 s->fd = glfs_open(s->glfs, gconf->path, open_flags);
8d6d89cb
BR
377 if (!s->fd) {
378 ret = -errno;
8d6d89cb 379 }
8d6d89cb 380
947eb203
NV
381 s->supports_seek_data = qemu_gluster_test_seek(s->fd);
382
8d6d89cb 383out:
b4894776 384 qemu_opts_del(opts);
8d6d89cb
BR
385 qemu_gluster_gconf_free(gconf);
386 if (!ret) {
387 return ret;
388 }
389 if (s->fd) {
390 glfs_close(s->fd);
391 }
392 if (s->glfs) {
393 glfs_fini(s->glfs);
394 }
395 return ret;
396}
397
adccfbcd
JC
/* Staging state for bdrv_reopen: the new connection/fd built in _prepare */
typedef struct BDRVGlusterReopenState {
    struct glfs *glfs;   /* new gfapi connection, adopted by _commit */
    struct glfs_fd *fd;  /* new image fd opened with the new flags */
} BDRVGlusterReopenState;
402
403
/*
 * Prepare a reopen: build a second gfapi connection and fd with the new
 * flags. The old state is swapped in by _commit or discarded by _abort.
 * Returns 0 on success, negative errno on failure.
 */
static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                       BlockReopenQueue *queue, Error **errp)
{
    int ret = 0;
    BDRVGlusterState *s;
    BDRVGlusterReopenState *reop_s;
    GlusterConf *gconf = NULL;
    int open_flags = 0;

    assert(state != NULL);
    assert(state->bs != NULL);

    s = state->bs->opaque;

    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

    gconf = g_new0(GlusterConf, 1);

    /* keep the log level the image was originally opened with */
    gconf->debug_level = s->debug_level;
    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
        ret = -errno;
        goto exit;
    }

#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
    ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
                                 "resync-failed-syncs-after-fsync", "on");
    if (ret < 0) {
        error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
        ret = -errno;
        goto exit;
    }
#endif

    reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags);
    if (reop_s->fd == NULL) {
        /* reops->glfs will be cleaned up in _abort */
        ret = -errno;
        goto exit;
    }

exit:
    /* state->opaque will be freed in either the _abort or _commit */
    qemu_gluster_gconf_free(gconf);
    return ret;
}
454
455static void qemu_gluster_reopen_commit(BDRVReopenState *state)
456{
457 BDRVGlusterReopenState *reop_s = state->opaque;
458 BDRVGlusterState *s = state->bs->opaque;
459
460
461 /* close the old */
462 if (s->fd) {
463 glfs_close(s->fd);
464 }
465 if (s->glfs) {
466 glfs_fini(s->glfs);
467 }
468
469 /* use the newly opened image / connection */
470 s->fd = reop_s->fd;
471 s->glfs = reop_s->glfs;
472
473 g_free(state->opaque);
474 state->opaque = NULL;
475
476 return;
477}
478
479
480static void qemu_gluster_reopen_abort(BDRVReopenState *state)
481{
482 BDRVGlusterReopenState *reop_s = state->opaque;
483
484 if (reop_s == NULL) {
485 return;
486 }
487
488 if (reop_s->fd) {
489 glfs_close(reop_s->fd);
490 }
491
492 if (reop_s->glfs) {
493 glfs_fini(reop_s->glfs);
494 }
495
496 g_free(state->opaque);
497 state->opaque = NULL;
498
499 return;
500}
501
#ifdef CONFIG_GLUSTERFS_ZEROFILL
/*
 * Write zeroes via glfs_zerofill_async() and yield until the gfapi
 * callback resumes the coroutine with the result in acb.ret.
 */
static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
        int64_t offset, int size, BdrvRequestFlags flags)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}

/* Compile-time answer: the zerofill API is available in this build */
static inline bool gluster_supports_zerofill(void)
{
    return 1;
}

/* Synchronous zerofill used by image creation (prealloc=full) */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
        int64_t size)
{
    return glfs_zerofill(fd, offset, size);
}

#else
/* Zerofill API not available in this build */
static inline bool gluster_supports_zerofill(void)
{
    return 0;
}

/* No-op stub; never reached because gluster_supports_zerofill() is false */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
        int64_t size)
{
    return 0;
}
#endif
547
/*
 * Create a new image on a gluster volume.
 *
 * Opens a fresh gfapi connection, creates/truncates the file to the
 * sector-aligned requested size and, when prealloc=full is requested and
 * the zerofill API is available, preallocates it with glfs_zerofill().
 * Returns 0 on success or a negative errno.
 */
static int qemu_gluster_create(const char *filename,
                               QemuOpts *opts, Error **errp)
{
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;
    GlusterConf *gconf = g_new0(GlusterConf, 1);

    /* clamp the log level into gluster's valid 0-9 range */
    gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                                 GLUSTER_DEBUG_DEFAULT);
    if (gconf->debug_level < 0) {
        gconf->debug_level = 0;
    } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
        gconf->debug_level = GLUSTER_DEBUG_MAX;
    }

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
        ret = -errno;
        goto out;
    }

    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
        prealloc = 0;
    } else if (!strcmp(tmp, "full") &&
               gluster_supports_zerofill()) {
        prealloc = 1;
    } else {
        error_setg(errp, "Invalid preallocation mode: '%s'"
                   " or GlusterFS doesn't support zerofill API",
                   tmp);
        ret = -EINVAL;
        goto out;
    }

    fd = glfs_creat(glfs, gconf->path,
                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
    if (!fd) {
        ret = -errno;
    } else {
        /* glfs_ftruncate() returns 0 on success */
        if (!glfs_ftruncate(fd, total_size)) {
            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
            ret = -errno;
        }

        /* a failed close also fails the create */
        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
    }
out:
    g_free(tmp);
    qemu_gluster_gconf_free(gconf);
    if (glfs) {
        glfs_fini(glfs);
    }
    return ret;
}
615
15744b0b
BR
/*
 * Common coroutine read/write path: issue the async gfapi request and
 * yield until gluster_finish_aiocb() schedules the coroutine back with
 * the result in acb.ret. @write selects pwritev vs preadv.
 */
static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                 gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
        /* submission itself failed; the callback will not run */
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
645
42ec24e2
PB
646static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
647{
648 int ret;
649 BDRVGlusterState *s = bs->opaque;
650
651 ret = glfs_ftruncate(s->fd, offset);
652 if (ret < 0) {
653 return -errno;
654 }
655
656 return 0;
657}
658
15744b0b
BR
/* Coroutine read entry point: delegates to the common r/w path (write=0). */
static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
}
664
15744b0b
BR
/* Coroutine write entry point: delegates to the common r/w path (write=1). */
static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
}
670
5d4343e6
JC
671static void qemu_gluster_close(BlockDriverState *bs)
672{
673 BDRVGlusterState *s = bs->opaque;
674
675 if (s->fd) {
676 glfs_close(s->fd);
677 s->fd = NULL;
678 }
679 glfs_fini(s->glfs);
680}
681
/*
 * Flush via glfs_fsync_async(). On failure the whole BDS is taken down
 * (see the comment at the error label) because gluster may have dropped
 * its write-behind cache, making further writes unsafe.
 */
static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        ret = -errno;
        goto error;
    }

    qemu_coroutine_yield();
    if (acb.ret < 0) {
        ret = acb.ret;
        goto error;
    }

    return acb.ret;

error:
    /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
     * after a fsync failure, so we have no way of allowing the guest to safely
     * continue.  Gluster versions prior to 3.5.6 don't retain the cache
     * either, but will invalidate the fd on error, so this is again our only
     * option.
     *
     * The 'resync-failed-syncs-after-fsync' xlator option for the
     * write-behind cache will cause later gluster versions to retain its
     * cache after error, so long as the fd remains open.  However, we
     * currently have no way of knowing if this option is supported.
     *
     * TODO: Once gluster provides a way for us to determine if the option
     * is supported, bypass the closure and setting drv to NULL. */
    qemu_gluster_close(bs);
    bs->drv = NULL;
    return ret;
}
725
#ifdef CONFIG_GLUSTERFS_DISCARD
/*
 * Discard a sector range via glfs_discard_async() and yield until the
 * gfapi callback resumes the coroutine with the result.
 */
static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    /* size stays 0: discard has no expected transfer size to verify */
    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif
750
8d6d89cb
BR
751static int64_t qemu_gluster_getlength(BlockDriverState *bs)
752{
753 BDRVGlusterState *s = bs->opaque;
754 int64_t ret;
755
756 ret = glfs_lseek(s->fd, 0, SEEK_END);
757 if (ret < 0) {
758 return -errno;
759 } else {
760 return ret;
761 }
762}
763
764static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
765{
766 BDRVGlusterState *s = bs->opaque;
767 struct stat st;
768 int ret;
769
770 ret = glfs_fstat(s->fd, &st);
771 if (ret < 0) {
772 return -errno;
773 } else {
774 return st.st_blocks * 512;
775 }
776}
777
8ab6feec
KW
/* Newly created images cannot be assumed to read as zeroes. */
static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
    /* GlusterFS volume could be backed by a block device */
    return 0;
}
783
947eb203
NV
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 *
 * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
    BDRVGlusterState *s = bs->opaque;
    off_t offs;

    if (!s->supports_seek_data) {
        /* probe at open time showed SEEK_DATA is unreliable here */
        return -ENOTSUP;
    }

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = glfs_lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }
    assert(offs >= start);

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *                   or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = glfs_lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    assert(offs >= start);

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if it there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1 */
    return -EBUSY;
}
870
/*
 * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * (Based on raw_co_get_block_status() from raw-posix.c.)
 */
static int64_t coroutine_fn qemu_gluster_co_get_block_status(
        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file)
{
    BDRVGlusterState *s = bs->opaque;
    off_t start, data = 0, hole = 0;
    int64_t total_size;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

    start = sector_num * BDRV_SECTOR_SIZE;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        return total_size;
    } else if (start >= total_size) {
        *pnum = 0;
        return 0;
    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
        /* clamp the query so it never reaches past EOF */
        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

    ret = find_allocation(bs, start, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent,
         * possibly including a partial sector at EOF. */
        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent.  */
        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
        ret = BDRV_BLOCK_ZERO;
    }

    *file = bs;

    return ret | BDRV_BLOCK_OFFSET_VALID | start;
}
935
936
90c772de
CL
/* Options accepted by qemu-img create for gluster images */
static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    }
};
959
/* Driver for the plain "gluster" scheme (URI parser defaults to tcp) */
static BlockDriver bdrv_gluster = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
987
/* Driver for the explicit "gluster+tcp" scheme; same callbacks as above */
static BlockDriver bdrv_gluster_tcp = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+tcp",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1015
/* Driver for "gluster+unix" (unix domain socket transport) */
static BlockDriver bdrv_gluster_unix = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+unix",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1043
/* Driver for "gluster+rdma" (RDMA transport) */
static BlockDriver bdrv_gluster_rdma = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+rdma",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_needs_filename          = true,
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
    .bdrv_co_readv                = qemu_gluster_co_readv,
    .bdrv_co_writev               = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard              = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
};
1071
/* Register one BlockDriver per supported URI scheme */
static void bdrv_gluster_init(void)
{
    bdrv_register(&bdrv_gluster_rdma);
    bdrv_register(&bdrv_gluster_unix);
    bdrv_register(&bdrv_gluster_tcp);
    bdrv_register(&bdrv_gluster);
}

block_init(bdrv_gluster_init);