/*
- * Copyright (C) 2016-2020 Red Hat, Inc.
+ * Copyright (C) 2016-2022 Red Hat, Inc.
* Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
*
* Network Block Device Server Side
#include "trace.h"
#include "nbd-internal.h"
#include "qemu/units.h"
+#include "qemu/memalign.h"
#define NBD_META_ID_BASE_ALLOCATION 0
+#define NBD_META_ID_ALLOCATION_DEPTH 1
/* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
-#define NBD_META_ID_DIRTY_BITMAP 1
+#define NBD_META_ID_DIRTY_BITMAP 2
/*
* NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
typedef struct NBDRequestData NBDRequestData;
struct NBDRequestData {
- QSIMPLEQ_ENTRY(NBDRequestData) entry;
NBDClient *client;
uint8_t *data;
bool complete;
BlockBackend *eject_notifier_blk;
Notifier eject_notifier;
+ bool allocation_depth;
BdrvDirtyBitmap **export_bitmaps;
size_t nr_export_bitmaps;
};
NBDExport *exp;
size_t count; /* number of negotiated contexts */
bool base_allocation; /* export base:allocation context (block status) */
+ bool allocation_depth; /* export qemu:allocation-depth */
bool *bitmaps; /*
* export qemu:dirty-bitmap:<export bitmap name>,
* sized by exp->nr_export_bitmaps
CoMutex send_lock;
Coroutine *send_coroutine;
+ bool read_yielding;
+ bool quiescing;
+
QTAILQ_ENTRY(NBDClient) next;
int nb_requests;
bool closing;
/* Send an error reply.
* Return -errno on error, 0 on success. */
-static int GCC_FMT_ATTR(4, 0)
+static int G_GNUC_PRINTF(4, 0)
nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
Error **errp, const char *fmt, va_list va)
{
/* Send an error reply.
* Return -errno on error, 0 on success. */
-static int GCC_FMT_ATTR(4, 5)
+static int G_GNUC_PRINTF(4, 5)
nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
Error **errp, const char *fmt, ...)
{
/* Drop remainder of the current option, and send a reply with the
* given error type and message. Return -errno on read or write
* failure; or 0 if connection is still live. */
-static int GCC_FMT_ATTR(4, 0)
+static int G_GNUC_PRINTF(4, 0)
nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
const char *fmt, va_list va)
{
return ret;
}
-static int GCC_FMT_ATTR(4, 5)
+static int G_GNUC_PRINTF(4, 5)
nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
const char *fmt, ...)
{
return ret;
}
-static int GCC_FMT_ATTR(3, 4)
+static int G_GNUC_PRINTF(3, 4)
nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
{
int ret;
/* nbd_meta_qemu_query
*
* Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
- * context is available. Return true if @query has been handled.
+ * and qemu:allocation-depth contexts are available. Return true if @query
+ * has been handled.
*/
static bool nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta,
const char *query)
if (!*query) {
if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
- memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
+ meta->allocation_depth = meta->exp->allocation_depth;
+ if (meta->exp->nr_export_bitmaps) {
+ memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
+ }
}
trace_nbd_negotiate_meta_query_parse("empty");
return true;
}
+ if (strcmp(query, "allocation-depth") == 0) {
+ trace_nbd_negotiate_meta_query_parse("allocation-depth");
+ meta->allocation_depth = meta->exp->allocation_depth;
+ return true;
+ }
+
if (nbd_strshift(&query, "dirty-bitmap:")) {
trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
if (!*query) {
- if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
+ if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
+ meta->exp->nr_export_bitmaps) {
memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
}
trace_nbd_negotiate_meta_query_parse("empty");
return true;
}
- trace_nbd_negotiate_meta_query_skip("not dirty-bitmap");
+ trace_nbd_negotiate_meta_query_skip("unknown qemu context");
return true;
}
{
int ret;
g_autofree char *export_name = NULL;
- g_autofree bool *bitmaps = NULL;
+ /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
+ g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
NBDExportMetaContexts local_meta = {0};
uint32_t nb_queries;
size_t i;
size_t count = 0;
- if (!client->structured_reply) {
+ if (client->opt == NBD_OPT_SET_META_CONTEXT && !client->structured_reply) {
return nbd_opt_invalid(client, errp,
"request option '%s' when structured reply "
"is not negotiated",
if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
/* enable all known contexts */
meta->base_allocation = true;
- memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
+ meta->allocation_depth = meta->exp->allocation_depth;
+ if (meta->exp->nr_export_bitmaps) {
+ memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
+ }
} else {
for (i = 0; i < nb_queries; ++i) {
ret = nbd_negotiate_meta_query(client, meta, errp);
count++;
}
+ if (meta->allocation_depth) {
+ ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
+ NBD_META_ID_ALLOCATION_DEPTH,
+ errp);
+ if (ret < 0) {
+ return ret;
+ }
+ count++;
+ }
+
for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
const char *bm_name;
g_autofree char *context = NULL;
return 0;
}
-static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
+/* nbd_read_eof
+ * Tries to read @size bytes from @ioc. This is a local implementation of
+ * qio_channel_readv_all_eof. We have it here because we need it to be
+ * interruptible and to know when the coroutine is yielding.
+ * Returns 1 on success
+ * 0 on eof, when no data was read (errp is not set)
+ * negative errno on failure (errp is set)
+ */
+static inline int coroutine_fn
+nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
+{
+ bool partial = false;
+
+ assert(size);
+ while (size > 0) {
+ struct iovec iov = { .iov_base = buffer, .iov_len = size };
+ ssize_t len;
+
+ len = qio_channel_readv(client->ioc, &iov, 1, errp);
+ if (len == QIO_CHANNEL_ERR_BLOCK) {
+ client->read_yielding = true;
+ qio_channel_yield(client->ioc, G_IO_IN);
+ client->read_yielding = false;
+ if (client->quiescing) {
+ return -EAGAIN;
+ }
+ continue;
+ } else if (len < 0) {
+ return -EIO;
+ } else if (len == 0) {
+ if (partial) {
+ error_setg(errp,
+ "Unexpected end-of-file before all bytes were read");
+ return -EIO;
+ } else {
+ return 0;
+ }
+ }
+
+ partial = true;
+ size -= len;
+ buffer = (uint8_t *) buffer + len;
+ }
+ return 1;
+}
+
+static int nbd_receive_request(NBDClient *client, NBDRequest *request,
Error **errp)
{
uint8_t buf[NBD_REQUEST_SIZE];
uint32_t magic;
int ret;
- ret = nbd_read(ioc, buf, sizeof(buf), "request", errp);
+ ret = nbd_read_eof(client, buf, sizeof(buf), errp);
if (ret < 0) {
return ret;
}
+ if (ret == 0) {
+ return -EIO;
+ }
/* Request
[ 0 .. 3] magic (NBD_REQUEST_MAGIC)
g_free(req);
client->nb_requests--;
+
+ if (client->quiescing && client->nb_requests == 0) {
+ aio_wait_kick();
+ }
+
nbd_client_receive_next_request(client);
nbd_client_put(client);
QTAILQ_FOREACH(client, &exp->clients, next) {
qio_channel_attach_aio_context(client->ioc, ctx);
- if (client->recv_coroutine) {
- aio_co_schedule(ctx, client->recv_coroutine);
- }
- if (client->send_coroutine) {
- aio_co_schedule(ctx, client->send_coroutine);
- }
+
+ assert(client->nb_requests == 0);
+ assert(client->recv_coroutine == NULL);
+ assert(client->send_coroutine == NULL);
}
}
exp->common.ctx = NULL;
}
+static void nbd_drained_begin(void *opaque)
+{
+ NBDExport *exp = opaque;
+ NBDClient *client;
+
+ QTAILQ_FOREACH(client, &exp->clients, next) {
+ client->quiescing = true;
+ }
+}
+
+static void nbd_drained_end(void *opaque)
+{
+ NBDExport *exp = opaque;
+ NBDClient *client;
+
+ QTAILQ_FOREACH(client, &exp->clients, next) {
+ client->quiescing = false;
+ nbd_client_receive_next_request(client);
+ }
+}
+
+static bool nbd_drained_poll(void *opaque)
+{
+ NBDExport *exp = opaque;
+ NBDClient *client;
+
+ QTAILQ_FOREACH(client, &exp->clients, next) {
+ if (client->nb_requests != 0) {
+ /*
+ * If there's a coroutine waiting for a request on nbd_read_eof()
+ * enter it here so we don't depend on the client to wake it up.
+ */
+ if (client->recv_coroutine != NULL && client->read_yielding) {
+ qemu_aio_coroutine_enter(exp->common.ctx,
+ client->recv_coroutine);
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static void nbd_eject_notifier(Notifier *n, void *data)
{
NBDExport *exp = container_of(n, NBDExport, eject_notifier);
blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
}
+static const BlockDevOps nbd_block_ops = {
+ .drained_begin = nbd_drained_begin,
+ .drained_end = nbd_drained_end,
+ .drained_poll = nbd_drained_poll,
+};
+
static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
Error **errp)
{
int64_t size;
uint64_t perm, shared_perm;
bool readonly = !exp_args->writable;
- bool shared = !exp_args->writable;
- strList *bitmaps;
+ BlockDirtyBitmapOrStrList *bitmaps;
size_t i;
int ret;
exp->description = g_strdup(arg->description);
exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
+
+ if (nbd_server_max_connections() != 1) {
+ exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
+ }
if (readonly) {
exp->nbdflags |= NBD_FLAG_READ_ONLY;
- if (shared) {
- exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
- }
} else {
exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
NBD_FLAG_SEND_FAST_ZERO);
}
exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
for (i = 0, bitmaps = arg->bitmaps; bitmaps;
- i++, bitmaps = bitmaps->next) {
- const char *bitmap = bitmaps->value;
+ i++, bitmaps = bitmaps->next)
+ {
+ const char *bitmap;
BlockDriverState *bs = blk_bs(blk);
BdrvDirtyBitmap *bm = NULL;
- while (bs) {
- bm = bdrv_find_dirty_bitmap(bs, bitmap);
- if (bm != NULL) {
- break;
+ switch (bitmaps->value->type) {
+ case QTYPE_QSTRING:
+ bitmap = bitmaps->value->u.local;
+ while (bs) {
+ bm = bdrv_find_dirty_bitmap(bs, bitmap);
+ if (bm != NULL) {
+ break;
+ }
+
+ bs = bdrv_filter_or_cow_bs(bs);
}
- bs = bdrv_filter_or_cow_bs(bs);
- }
+ if (bm == NULL) {
+ ret = -ENOENT;
+ error_setg(errp, "Bitmap '%s' is not found",
+ bitmaps->value->u.local);
+ goto fail;
+ }
- if (bm == NULL) {
- ret = -ENOENT;
- error_setg(errp, "Bitmap '%s' is not found", bitmap);
- goto fail;
+ if (readonly && bdrv_is_writable(bs) &&
+ bdrv_dirty_bitmap_enabled(bm)) {
+ ret = -EINVAL;
+ error_setg(errp, "Enabled bitmap '%s' incompatible with "
+ "readonly export", bitmap);
+ goto fail;
+ }
+ break;
+ case QTYPE_QDICT:
+ bitmap = bitmaps->value->u.external.name;
+ bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
+ bitmap, NULL, errp);
+ if (!bm) {
+ ret = -ENOENT;
+ goto fail;
+ }
+ break;
+ default:
+ abort();
}
- if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
- ret = -EINVAL;
- goto fail;
- }
+ assert(bm);
- if (readonly && bdrv_is_writable(bs) &&
- bdrv_dirty_bitmap_enabled(bm)) {
+ if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
ret = -EINVAL;
- error_setg(errp,
- "Enabled bitmap '%s' incompatible with readonly export",
- bitmap);
goto fail;
}
bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
}
+ exp->allocation_depth = arg->allocation_depth;
+
+ /*
+ * We need to inhibit request queuing in the block layer to ensure we can
+ * be properly quiesced when entering a drained section, as our coroutines
+ * servicing pending requests might enter blk_pread().
+ */
+ blk_set_disable_request_queuing(blk, true);
+
blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
+ blk_set_dev_ops(blk, &nbd_block_ops, exp);
+
QTAILQ_INSERT_TAIL(&exports, exp, next);
return 0;
}
blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
blk_aio_detach, exp);
+ blk_set_disable_request_queuing(exp->common.blk, false);
}
for (i = 0; i < exp->nr_export_bitmaps; i++) {
stl_be_p(&chunk.length, pnum);
ret = nbd_co_send_iov(client, iov, 1, errp);
} else {
- ret = blk_pread(exp->common.blk, offset + progress,
- data + progress, pnum);
+ ret = blk_pread(exp->common.blk, offset + progress, pnum,
+ data + progress, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "reading from file failed");
break;
g_free(ea->extents);
g_free(ea);
}
-G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)
/* Further modifications of the array after conversion are abandoned */
static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
* Add extent to NBDExtentArray. If extent can't be added (no available space),
* return -1.
* For safety, when returning -1 for the first time, .can_add is set to false,
- * further call to nbd_extent_array_add() will crash.
- * (to avoid the situation, when after failing to add an extent (returned -1),
- * user miss this failure and add another extent, which is successfully added
- * (array is full, but new extent may be squashed into the last one), then we
- * have invalid array with skipped extent)
+ * and further calls to nbd_extent_array_add() will crash.
+ * (this avoids the situation where a caller ignores failure to add one extent,
+ * where adding another extent that would squash into the last array entry
+ * would result in an incorrect range reported to the client)
*/
static int nbd_extent_array_add(NBDExtentArray *ea,
uint32_t length, uint32_t flags)
return ret;
}
- flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
- (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
+ flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
+ (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
if (nbd_extent_array_add(ea, num, flags) < 0) {
return 0;
return 0;
}
+static int blockalloc_to_extents(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, NBDExtentArray *ea)
+{
+ while (bytes) {
+ int64_t num;
+ int ret = bdrv_is_allocated_above(bs, NULL, false, offset, bytes,
+ &num);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (nbd_extent_array_add(ea, num, ret) < 0) {
+ return 0;
+ }
+
+ offset += num;
+ bytes -= num;
+ }
+
+ return 0;
+}
+
/*
* nbd_co_send_extents
*
unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
- ret = blockstatus_to_extents(bs, offset, length, ea);
+ if (context_id == NBD_META_ID_BASE_ALLOCATION) {
+ ret = blockstatus_to_extents(bs, offset, length, ea);
+ } else {
+ ret = blockalloc_to_extents(bs, offset, length, ea);
+ }
if (ret < 0) {
return nbd_co_send_structured_error(
client, handle, -ret, "can't get block status", errp);
}
if (!full) {
- /* last non dirty extent */
- nbd_extent_array_add(es, end - start, 0);
+ /* last non dirty extent, nothing to do if array is now full */
+ (void) nbd_extent_array_add(es, end - start, 0);
}
bdrv_dirty_bitmap_unlock(bitmap);
/* nbd_co_receive_request
* Collect a client request. Return 0 if request looks valid, -EIO to drop
- * connection right away, and any other negative value to report an error to
- * the client (although the caller may still need to disconnect after reporting
- * the error).
+ * connection right away, -EAGAIN to indicate we were interrupted and the
+ * channel should be quiesced, and any other negative value to report an error
+ * to the client (although the caller may still need to disconnect after
+ * reporting the error).
*/
static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
Error **errp)
{
NBDClient *client = req->client;
int valid_flags;
+ int ret;
g_assert(qemu_in_coroutine());
assert(client->recv_coroutine == qemu_coroutine_self());
- if (nbd_receive_request(client->ioc, request, errp) < 0) {
- return -EIO;
+ ret = nbd_receive_request(client, request, errp);
+ if (ret < 0) {
+ return ret;
}
trace_nbd_co_receive_request_decode_type(request->handle, request->type,
data, request->len, errp);
}
- ret = blk_pread(exp->common.blk, request->from, data, request->len);
+ ret = blk_pread(exp->common.blk, request->from, request->len, data, 0);
if (ret < 0) {
return nbd_send_generic_reply(client, request->handle, ret,
"reading from file failed", errp);
if (request->flags & NBD_CMD_FLAG_FUA) {
flags |= BDRV_REQ_FUA;
}
- ret = blk_pwrite(exp->common.blk, request->from, data, request->len,
+ ret = blk_pwrite(exp->common.blk, request->from, request->len, data,
flags);
return nbd_send_generic_reply(client, request->handle, ret,
"writing to file failed", errp);
if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
flags |= BDRV_REQ_NO_FALLBACK;
}
- ret = 0;
- /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */
- while (ret >= 0 && request->len) {
- int align = client->check_align ?: 1;
- int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
- align));
- ret = blk_pwrite_zeroes(exp->common.blk, request->from, len, flags);
- request->len -= len;
- request->from += len;
- }
+ ret = blk_pwrite_zeroes(exp->common.blk, request->from, request->len,
+ flags);
return nbd_send_generic_reply(client, request->handle, ret,
"writing to file failed", errp);
"flush failed", errp);
case NBD_CMD_TRIM:
- ret = 0;
- /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */
- while (ret >= 0 && request->len) {
- int align = client->check_align ?: 1;
- int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
- align));
- ret = blk_co_pdiscard(exp->common.blk, request->from, len);
- request->len -= len;
- request->from += len;
- }
+ ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
ret = blk_co_flush(exp->common.blk);
}
}
}
+ if (client->export_meta.allocation_depth) {
+ ret = nbd_co_send_block_status(client, request->handle,
+ blk_bs(exp->common.blk),
+ request->from, request->len,
+ dont_fragment,
+ !--contexts_remaining,
+ NBD_META_ID_ALLOCATION_DEPTH,
+ errp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
if (!client->export_meta.bitmaps[i]) {
continue;
return;
}
+ if (client->quiescing) {
+ /*
+ * We're switching between AIO contexts. Don't attempt to receive a new
+ * request and kick the main context which may be waiting for us.
+ */
+ nbd_client_put(client);
+ client->recv_coroutine = NULL;
+ aio_wait_kick();
+ return;
+ }
+
req = nbd_request_get(client);
ret = nbd_co_receive_request(req, &request, &local_err);
client->recv_coroutine = NULL;
goto done;
}
+ if (ret == -EAGAIN) {
+ assert(client->quiescing);
+ goto done;
+ }
+
nbd_client_receive_next_request(client);
if (ret == -EIO) {
goto disconnect;
}
if (ret < 0) {
- /* It wans't -EIO, so, according to nbd_co_receive_request()
+ /* It wasn't -EIO, so, according to nbd_co_receive_request()
* semantics, we should return the error to the client. */
Error *export_err = local_err;
static void nbd_client_receive_next_request(NBDClient *client)
{
- if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
+ if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
+ !client->quiescing) {
nbd_client_get(client);
client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);