#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
+#include "block/block-io.h"
#include "block/block_int.h"
#include "block/qdict.h"
#include "crypto/secret.h"
int64_t ret;
} RBDTask;
+typedef struct RBDDiffIterateReq { /* state handed to qemu_rbd_diff_iterate_cb */
+ uint64_t offs; /* start of the queried range */
+ uint64_t bytes; /* length, counted from offs, of the run found so far */
+ bool exists; /* set once the callback has seen an allocated extent */
+} RBDDiffIterateReq;
+
static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
BlockdevOptionsRbd *opts, bool cache,
const char *keypairs, const char *secretid,
int ret;
assert(options->driver == BLOCKDEV_DRIVER_RBD);
- if (opts->location->has_snapshot) {
+ if (opts->location->snapshot) {
error_setg(errp, "Can't use snapshot name for image creation");
return -EINVAL;
}
#ifndef LIBRBD_SUPPORTS_ENCRYPTION
- if (opts->has_encrypt) {
+ if (opts->encrypt) {
error_setg(errp, "RBD library does not support image encryption");
return -ENOTSUP;
}
}
#ifdef LIBRBD_SUPPORTS_ENCRYPTION
- if (opts->has_encrypt) {
+ if (opts->encrypt) {
rbd_image_t image;
ret = rbd_open(io_ctx, opts->location->image, &image, NULL);
goto exit;
}
rbd_opts->encrypt = encrypt;
- rbd_opts->has_encrypt = !!encrypt;
/*
* Caution: while qdict_get_try_str() is fine, getting non-string
loc = rbd_opts->location;
loc->pool = g_strdup(qdict_get_try_str(options, "pool"));
loc->conf = g_strdup(qdict_get_try_str(options, "conf"));
- loc->has_conf = !!loc->conf;
loc->user = g_strdup(qdict_get_try_str(options, "user"));
- loc->has_user = !!loc->user;
loc->q_namespace = g_strdup(qdict_get_try_str(options, "namespace"));
- loc->has_q_namespace = !!loc->q_namespace;
loc->image = g_strdup(qdict_get_try_str(options, "image"));
keypairs = qdict_get_try_str(options, "=keyvalue-pairs");
return -EINVAL;
}
opts->key_secret = g_strdup(secretid);
- opts->has_key_secret = true;
}
mon_host = qemu_rbd_mon_host(opts, &local_err);
/* try default location when conf=NULL, but ignore failure */
r = rados_conf_read_file(*cluster, opts->conf);
- if (opts->has_conf && r < 0) {
+ if (opts->conf && r < 0) {
error_setg_errno(errp, -r, "error reading conf file %s", opts->conf);
goto failed_shutdown;
}
error_setg_errno(errp, -r, "error opening pool %s", opts->pool);
goto failed_shutdown;
}
+
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
+ if (opts->q_namespace && strlen(opts->q_namespace) > 0) {
+ bool exists;
+
+ r = rbd_namespace_exists(*io_ctx, opts->q_namespace, &exists);
+ if (r < 0) {
+ error_setg_errno(errp, -r, "error checking namespace");
+ goto failed_ioctx_destroy;
+ }
+
+ if (!exists) {
+ error_setg(errp, "namespace '%s' does not exist",
+ opts->q_namespace);
+ r = -ENOENT;
+ goto failed_ioctx_destroy;
+ }
+ }
+#endif
+
/*
* Set the namespace after opening the io context on the pool,
* if nspace == NULL or if nspace == "", it is just as we did nothing
r = 0;
goto out;
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
+failed_ioctx_destroy:
+ rados_ioctx_destroy(*io_ctx);
+#endif
failed_shutdown:
rados_shutdown(*cluster);
out:
goto failed_open;
}
- if (opts->has_encrypt) {
+ if (opts->encrypt) {
#ifdef LIBRBD_SUPPORTS_ENCRYPTION
r = qemu_rbd_encryption_load(s->image, opts->encrypt, errp);
if (r < 0) {
assert(!qiov || qiov->size == bytes);
+ if (cmd == RBD_AIO_WRITE || cmd == RBD_AIO_WRITE_ZEROES) {
+ /*
+ * RBD APIs don't allow us to write more than actual size, so in order
+ * to support growing images, we resize the image before write
+ * operations that exceed the current size.
+ */
+ if (offset + bytes > s->image_size) {
+ int r = qemu_rbd_resize(bs, offset + bytes);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
r = rbd_aio_create_completion(&task,
(rbd_callback_t) qemu_rbd_completion_cb, &c);
if (r < 0) {
}
static int
-coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, QEMUIOVector *qiov,
- int flags)
+coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
{
- BDRVRBDState *s = bs->opaque;
- /*
- * RBD APIs don't allow us to write more than actual size, so in order
- * to support growing images, we resize the image before write
- * operations that exceed the current size.
- */
- if (offset + bytes > s->image_size) {
- int r = qemu_rbd_resize(bs, offset + bytes);
- if (r < 0) {
- return r;
- }
- }
return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_WRITE);
}
}
static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
- int64_t offset, int count)
+ int64_t offset, int64_t bytes)
{
- return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
+ return qemu_rbd_start_co(bs, offset, bytes, NULL, 0, RBD_AIO_DISCARD);
}
#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
static int
coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
- int count, BdrvRequestFlags flags)
+ int64_t bytes, BdrvRequestFlags flags)
{
- return qemu_rbd_start_co(bs, offset, count, NULL, flags,
+ return qemu_rbd_start_co(bs, offset, bytes, NULL, flags,
RBD_AIO_WRITE_ZEROES);
}
#endif
return spec_info;
}
+/*
+ * rbd_diff_iterate2 allows the iteration to be interrupted by returning a
+ * negative value from the callback routine. Choose a value that does not
+ * conflict with an existing exit code and return it if we want to stop the
+ * iteration prematurely because we detected a change in the allocation
+ * status.
+ *
+ * qemu_rbd_diff_iterate_cb() below is that callback: opaque points to an
+ * RBDDiffIterateReq in which it accumulates the length of the first run of
+ * extents, starting at req->offs, that share one allocation status.
+ */
+#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+
+static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+ int exists, void *opaque)
+{
+ RBDDiffIterateReq *req = opaque;
+
+ assert(req->offs + req->bytes <= offs);
+
+ /*
+ * treat a hole like an unallocated area and bail out of this invocation
+ * without touching req; returning 0 lets the iteration continue
+ */
+ if (!exists) {
+ return 0;
+ }
+
+ if (!req->exists && offs > req->offs) {
+ /*
+ * we started in an unallocated area and hit the first allocated
+ * block. req->bytes must be set to the length of the unallocated area
+ * before the allocated area. stop further processing.
+ */
+ req->bytes = offs - req->offs;
+ return QEMU_RBD_EXIT_DIFF_ITERATE2;
+ }
+
+ if (req->exists && offs > req->offs + req->bytes) {
+ /*
+ * we started in an allocated area and jumped over an unallocated area,
+ * req->bytes contains the length of the allocated area before the
+ * unallocated area. stop further processing.
+ */
+ return QEMU_RBD_EXIT_DIFF_ITERATE2;
+ }
+
+ req->bytes += len; /* extent is contiguous with the run: extend it */
+ req->exists = true;
+
+ return 0;
+}
+
+static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ bool want_zero, int64_t offset,
+ int64_t bytes, int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file)
+{
+ BDRVRBDState *s = bs->opaque;
+ int status, r;
+ RBDDiffIterateReq req = { .offs = offset };
+ uint64_t features, flags;
+ uint64_t head = 0; /* bytes the start is moved back by the workaround below */
+
+ assert(offset + bytes <= s->image_size);
+
+ /*
+ * default to all sectors allocated; this answer is returned unchanged
+ * whenever fast-diff cannot be consulted or the diff iteration fails
+ */
+ status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+ *map = offset;
+ *file = bs;
+ *pnum = bytes;
+
+ /* check if RBD image supports fast-diff */
+ r = rbd_get_features(s->image, &features);
+ if (r < 0) {
+ return status;
+ }
+ if (!(features & RBD_FEATURE_FAST_DIFF)) {
+ return status;
+ }
+
+ /* check if RBD fast-diff result is valid */
+ r = rbd_get_flags(s->image, &flags);
+ if (r < 0) {
+ return status;
+ }
+ if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+ return status;
+ }
+
+#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
+ /*
+ * librbd had a bug until early 2022 that affected all versions of ceph that
+ * supported fast-diff. This bug results in reporting of incorrect offsets
+ * if the offset parameter to rbd_diff_iterate2 is not object aligned.
+ * Work around this bug by rounding down the offset to object boundaries.
+ * This is OK because we call rbd_diff_iterate2 with whole_object = true.
+ * However, this workaround only works for non cloned images with default
+ * striping.
+ *
+ * The number of bytes the start is moved back is kept in head and is
+ * subtracted from the result again before it is returned.
+ *
+ * See: https://tracker.ceph.com/issues/53784
+ */
+
+ /* check if RBD image has non-default striping enabled */
+ if (features & RBD_FEATURE_STRIPINGV2) {
+ return status;
+ }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+ /*
+ * check if RBD image is a clone (= has a parent).
+ *
+ * rbd_get_parent_info is deprecated from Nautilus onwards, but the
+ * replacement rbd_get_parent is not present in Luminous and Mimic.
+ */
+ if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
+ return status;
+ }
+#pragma GCC diagnostic pop
+
+ head = req.offs & (s->object_size - 1);
+ req.offs -= head;
+ bytes += head;
+#endif
+
+ r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
+ qemu_rbd_diff_iterate_cb, &req);
+ if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+ return status; /* genuine failure: keep the default answer */
+ }
+ assert(req.bytes <= bytes);
+ if (!req.exists) {
+ if (r == 0) {
+ /*
+ * rbd_diff_iterate2 does not invoke callbacks for unallocated
+ * areas. This catches the case where the iteration completed
+ * without the callback recording any allocated extent
+ * (req.bytes == 0), so the whole range is unallocated.
+ */
+ assert(req.bytes == 0);
+ req.bytes = bytes;
+ }
+ status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+ }
+
+ assert(req.bytes > head);
+ *pnum = req.bytes - head; /* drop the prefix added by rounding down */
+ return status;
+}
+
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
BDRVRBDState *s = bs->opaque;
#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
.bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
#endif
+ .bdrv_co_block_status = qemu_rbd_co_block_status,
.bdrv_snapshot_create = qemu_rbd_snap_create,
.bdrv_snapshot_delete = qemu_rbd_snap_remove,