/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
-static void bdrv_parent_cb_resize(BlockDriverState *bs);
+static void coroutine_fn GRAPH_RDLOCK
+bdrv_parent_cb_resize(BlockDriverState *bs);
+
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int64_t bytes, BdrvRequestFlags flags);
-static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
+static void GRAPH_RDLOCK
+bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
BdrvChild *c, *next;
+ IO_OR_GS_CODE();
+ assert_bdrv_graph_readable();
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c == ignore) {
}
}
-static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
+static void GRAPH_RDLOCK
+bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
BdrvChild *c;
+ IO_OR_GS_CODE();
+ assert_bdrv_graph_readable();
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c == ignore) {
/*
 * Returns the result of the parent's drained_poll callback for @c, or false
 * when c->klass does not provide that callback.
 */
bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
+ IO_OR_GS_CODE();
+
if (c->klass->drained_poll) {
return c->klass->drained_poll(c);
}
return false;
}
-static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
- bool ignore_bds_parents)
+static bool GRAPH_RDLOCK
+bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
+ bool ignore_bds_parents)
{
BdrvChild *c, *next;
bool busy = false;
+ IO_OR_GS_CODE();
+ assert_bdrv_graph_readable();
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
c->quiesced_parent = true;
if (c->klass->drained_begin) {
+ /* called with rdlock taken, but it doesn't really need it. */
c->klass->drained_begin(c);
}
}
/*
 * Polls @bs by forwarding to bdrv_drain_poll() with its third argument fixed
 * to false; @ignore_parent is passed through unchanged.
 *
 * NOTE(review): GRAPH_RDLOCK_GUARD_MAINLOOP() presumably keeps the graph read
 * lock annotation satisfied for the rest of the scope -- confirm against the
 * macro definition.
 */
static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
BdrvChild *ignore_parent)
{
+ GLOBAL_STATE_CODE();
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
+
return bdrv_drain_poll(bs, ignore_parent, false);
}
* timer callback), it is a bug in the caller that should be fixed. */
assert(data.done);
- /* Reaquire the AioContext of bs if we dropped it */
+ /* Reacquire the AioContext of bs if we dropped it */
if (ctx != co_ctx) {
aio_context_acquire(ctx);
}
/* Stop things in parent-to-child order */
if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
bdrv_parent_drained_begin(bs, parent);
if (bs->drv && bs->drv->bdrv_drain_begin) {
bs->drv->bdrv_drain_begin(bs);
bdrv_do_drained_begin(bs, parent, false);
}
-void bdrv_drained_begin(BlockDriverState *bs)
+void coroutine_mixed_fn
+bdrv_drained_begin(BlockDriverState *bs)
{
IO_OR_GS_CODE();
bdrv_do_drained_begin(bs, NULL, true);
bdrv_co_yield_to_drain(bs, false, parent, false);
return;
}
+
+ /* At this point, we should always be running in the main loop. */
+ GLOBAL_STATE_CODE();
assert(bs->quiesce_counter > 0);
GLOBAL_STATE_CODE();
/* Re-enable things in child-to-parent order */
old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
if (old_quiesce_counter == 1) {
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
if (bs->drv && bs->drv->bdrv_drain_end) {
bs->drv->bdrv_drain_end(bs);
}
static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
BdrvChild *child, *next;
+ GLOBAL_STATE_CODE();
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
assert(qatomic_read(&bs->in_flight) == 0);
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
{
BlockDriverState *bs = NULL;
bool result = false;
+
GLOBAL_STATE_CODE();
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
/* bdrv_drain_poll() can't make changes to the graph and we are holding the
* main AioContext lock, so iterating bdrv_next_all_states() is safe. */
}
}
-void bdrv_drain_all_begin(void)
+void coroutine_mixed_fn bdrv_drain_all_begin(void)
{
BlockDriverState *bs = NULL;
qatomic_dec(&req->bs->serialising_in_flight);
}
- qemu_co_mutex_lock(&req->bs->reqs_lock);
+ qemu_mutex_lock(&req->bs->reqs_lock);
QLIST_REMOVE(req, list);
+ qemu_mutex_unlock(&req->bs->reqs_lock);
+
+ /*
+ * At this point qemu_co_queue_wait(&req->wait_queue, ...) won't be called
+ * anymore because the request has been removed from the list, so it's safe
+ * to restart the queue outside reqs_lock to minimize the critical section.
+ */
qemu_co_queue_restart_all(&req->wait_queue);
- qemu_co_mutex_unlock(&req->bs->reqs_lock);
}
/**
qemu_co_queue_init(&req->wait_queue);
- qemu_co_mutex_lock(&bs->reqs_lock);
+ qemu_mutex_lock(&bs->reqs_lock);
QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
- qemu_co_mutex_unlock(&bs->reqs_lock);
+ qemu_mutex_unlock(&bs->reqs_lock);
}
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
}
/**
- * Round a region to cluster boundaries
+ * Round a region to subcluster (if supported) or cluster boundaries
+ *
+ * On return, @align_offset holds @offset rounded down to the alignment and
+ * @align_bytes holds the length, rounded up to the alignment, of the region
+ * that starts at @align_offset and covers [@offset, @offset + @bytes).
+ *
+ * The alignment is the subcluster_size reported by bdrv_co_get_info(). If
+ * that call fails or reports a subcluster_size of 0, the region is returned
+ * unchanged (@align_offset = @offset, @align_bytes = @bytes).
*/
void coroutine_fn GRAPH_RDLOCK
-bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
- int64_t *cluster_offset, int64_t *cluster_bytes)
+bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ int64_t *align_offset, int64_t *align_bytes)
{
BlockDriverInfo bdi;
IO_CODE();
- if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
- *cluster_offset = offset;
- *cluster_bytes = bytes;
+ if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
+ *align_offset = offset;
+ *align_bytes = bytes;
} else {
- int64_t c = bdi.cluster_size;
- *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
- *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
+ int64_t c = bdi.subcluster_size;
+ *align_offset = QEMU_ALIGN_DOWN(offset, c);
+ *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
}
}
return;
}
- qemu_co_mutex_lock(&bs->reqs_lock);
+ qemu_mutex_lock(&bs->reqs_lock);
bdrv_wait_serialising_requests_locked(self);
- qemu_co_mutex_unlock(&bs->reqs_lock);
+ qemu_mutex_unlock(&bs->reqs_lock);
}
void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
{
IO_CODE();
- qemu_co_mutex_lock(&req->bs->reqs_lock);
+ qemu_mutex_lock(&req->bs->reqs_lock);
tracked_request_set_serialising(req, align);
bdrv_wait_serialising_requests_locked(req);
- qemu_co_mutex_unlock(&req->bs->reqs_lock);
+ qemu_mutex_unlock(&req->bs->reqs_lock);
}
int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
void *bounce_buffer = NULL;
BlockDriver *drv = bs->drv;
- int64_t cluster_offset;
- int64_t cluster_bytes;
+ int64_t align_offset;
+ int64_t align_bytes;
int64_t skip_bytes;
int ret;
int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
* BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
* is one reason we loop rather than doing it all at once.
*/
- bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
- skip_bytes = offset - cluster_offset;
+ bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
+ skip_bytes = offset - align_offset;
trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
- cluster_offset, cluster_bytes);
+ align_offset, align_bytes);
- while (cluster_bytes) {
+ while (align_bytes) {
int64_t pnum;
if (skip_write) {
ret = 1; /* "already allocated", so nothing will be copied */
- pnum = MIN(cluster_bytes, max_transfer);
+ pnum = MIN(align_bytes, max_transfer);
} else {
- ret = bdrv_is_allocated(bs, cluster_offset,
- MIN(cluster_bytes, max_transfer), &pnum);
+ ret = bdrv_co_is_allocated(bs, align_offset,
+ MIN(align_bytes, max_transfer), &pnum);
if (ret < 0) {
/*
* Safe to treat errors in querying allocation as if
* unallocated; we'll probably fail again soon on the
* read, but at least that will set a decent errno.
*/
- pnum = MIN(cluster_bytes, max_transfer);
+ pnum = MIN(align_bytes, max_transfer);
}
/* Stop at EOF if the image ends in the middle of the cluster */
/* Must copy-on-read; use the bounce buffer */
pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
if (!bounce_buffer) {
- int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
+ int64_t max_we_need = MAX(pnum, align_bytes - pnum);
int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
}
qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
- ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+ ret = bdrv_driver_preadv(bs, align_offset, pnum,
&local_qiov, 0, 0);
if (ret < 0) {
goto err;
/* FIXME: Should we (perhaps conditionally) be setting
* BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
* that still correctly reads as zero? */
- ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
+ ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
BDRV_REQ_WRITE_UNCHANGED);
} else {
/* This does not change the data on the disk, it is not
* necessary to flush even in cache=writethrough mode.
*/
- ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+ ret = bdrv_driver_pwritev(bs, align_offset, pnum,
&local_qiov, 0,
BDRV_REQ_WRITE_UNCHANGED);
}
}
}
- cluster_offset += pnum;
- cluster_bytes -= pnum;
+ align_offset += pnum;
+ align_bytes -= pnum;
progress += pnum - skip_bytes;
skip_bytes = 0;
}
/* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
flags &= ~BDRV_REQ_COPY_ON_READ;
- ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
+ ret = bdrv_co_is_allocated(bs, offset, bytes, &pnum);
if (ret < 0) {
goto out;
}
}
}
-static inline void coroutine_fn
+static inline void coroutine_fn GRAPH_RDLOCK
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
BdrvTrackedRequest *req, int ret)
{
int result = 0;
GLOBAL_STATE_CODE();
+ GRAPH_RDLOCK_GUARD_MAINLOOP();
/*
* bdrv queue is managed by record/replay,
* set to the host mapping and BDS corresponding to the guest offset.
*/
static int coroutine_fn GRAPH_RDLOCK
-bdrv_co_block_status(BlockDriverState *bs, bool want_zero,
- int64_t offset, int64_t bytes,
- int64_t *pnum, int64_t *map, BlockDriverState **file)
+bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map, BlockDriverState **file)
{
int64_t total_size;
int64_t n; /* bytes */
if (ret & BDRV_BLOCK_RAW) {
assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
- ret = bdrv_co_block_status(local_file, want_zero, local_map,
- *pnum, pnum, &local_map, &local_file);
+ ret = bdrv_co_do_block_status(local_file, want_zero, local_map,
+ *pnum, pnum, &local_map, &local_file);
goto out;
}
int64_t file_pnum;
int ret2;
- ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
- *pnum, &file_pnum, NULL, NULL);
+ ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map,
+ *pnum, &file_pnum, NULL, NULL);
if (ret2 >= 0) {
/* Ignore errors. This is just providing extra information, it
* is useful but not necessary.
return 0;
}
- ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
+ ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum,
+ map, file);
++*depth;
if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
return ret;
for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
p = bdrv_filter_or_cow_bs(p))
{
- ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
- file);
+ ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum,
+ map, file);
++*depth;
if (ret < 0) {
return ret;
bytes, pnum, map, file, NULL);
}
-int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
- int64_t offset, int64_t bytes, int64_t *pnum,
- int64_t *map, BlockDriverState **file)
/*
 * Coroutine block-status query for @bs: forwards @offset, @bytes, @pnum, @map
 * and @file to bdrv_co_block_status_above(), passing
 * bdrv_filter_or_cow_bs(bs) as the second argument.
 */
+int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file)
{
IO_CODE();
- return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
- pnum, map, file, NULL);
-}
-
-int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
- int64_t *pnum, int64_t *map, BlockDriverState **file)
-{
- IO_CODE();
- return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
- offset, bytes, pnum, map, file);
+ return bdrv_co_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
+ offset, bytes, pnum, map, file);
}
/*
return !!(ret & BDRV_BLOCK_ALLOCATED);
}
-int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
- int64_t *pnum)
-{
- int ret;
- int64_t dummy;
- IO_CODE();
-
- ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL, NULL);
- if (ret < 0) {
- return ret;
- }
- return !!(ret & BDRV_BLOCK_ALLOCATED);
-}
-
-/* See bdrv_is_allocated_above for documentation */
-int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
- BlockDriverState *base,
- bool include_base, int64_t offset,
- int64_t bytes, int64_t *pnum)
-{
- int depth;
- int ret;
- IO_CODE();
-
- ret = bdrv_co_common_block_status_above(top, base, include_base, false,
- offset, bytes, pnum, NULL, NULL,
- &depth);
- if (ret < 0) {
- return ret;
- }
-
- if (ret & BDRV_BLOCK_ALLOCATED) {
- return depth;
- }
- return 0;
-}
-
/*
* Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
*
* words, the result is not necessarily the maximum possible range);
* but 'pnum' will only be 0 when end of file is reached.
*/
-int bdrv_is_allocated_above(BlockDriverState *top,
- BlockDriverState *base,
- bool include_base, int64_t offset,
- int64_t bytes, int64_t *pnum)
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ bool include_base, int64_t offset,
+ int64_t bytes, int64_t *pnum)
{
int depth;
int ret;
IO_CODE();
- ret = bdrv_common_block_status_above(top, base, include_base, false,
- offset, bytes, pnum, NULL, NULL,
- &depth);
+ ret = bdrv_co_common_block_status_above(bs, base, include_base, false,
+ offset, bytes, pnum, NULL, NULL,
+ &depth);
if (ret < 0) {
return ret;
}
/**************************************************************/
/* async I/Os */
+/**
+ * Synchronously cancels an acb. Must be called with the BQL held and the acb
+ * must be processed with the BQL held too (IOThreads are not allowed).
+ *
+ * Takes a temporary reference on @acb, requests cancellation through
+ * bdrv_aio_cancel_async(), and waits for as long as acb->refcnt > 1 before
+ * dropping that reference again.
+ *
+ * Use bdrv_aio_cancel_async() instead when possible.
+ */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
- IO_CODE();
+ GLOBAL_STATE_CODE();
qemu_aio_ref(acb);
bdrv_aio_cancel_async(acb);
- while (acb->refcnt > 1) {
- if (acb->aiocb_info->get_aio_context) {
- aio_poll(acb->aiocb_info->get_aio_context(acb), true);
- } else if (acb->bs) {
- /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
- * assert that we're not using an I/O thread. Thread-safe
- * code should use bdrv_aio_cancel_async exclusively.
- */
- assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
- aio_poll(bdrv_get_aio_context(acb->bs), true);
- } else {
- abort();
- }
- }
+ AIO_WAIT_WHILE_UNLOCKED(NULL, acb->refcnt > 1);
qemu_aio_unref(acb);
}
goto early_exit;
}
- qemu_co_mutex_lock(&bs->reqs_lock);
+ qemu_mutex_lock(&bs->reqs_lock);
current_gen = qatomic_read(&bs->write_gen);
/* Wait until any previous flushes are completed */
/* Flushes reach this point in nondecreasing current_gen order. */
bs->active_flush_req = true;
- qemu_co_mutex_unlock(&bs->reqs_lock);
+ qemu_mutex_unlock(&bs->reqs_lock);
/* Write back all layers by calling one driver function */
if (bs->drv->bdrv_co_flush) {
bs->flushed_gen = current_gen;
}
- qemu_co_mutex_lock(&bs->reqs_lock);
+ qemu_mutex_lock(&bs->reqs_lock);
bs->active_flush_req = false;
/* Return value is ignored - it's ok if wait queue is empty */
qemu_co_queue_next(&bs->flush_queue);
- qemu_co_mutex_unlock(&bs->reqs_lock);
+ qemu_mutex_unlock(&bs->reqs_lock);
early_exit:
bdrv_dec_in_flight(bs);
bytes, read_flags, write_flags);
}
-static void bdrv_parent_cb_resize(BlockDriverState *bs)
+static void coroutine_fn GRAPH_RDLOCK
+bdrv_parent_cb_resize(BlockDriverState *bs)
{
BdrvChild *c;
+
+ assert_bdrv_graph_readable();
+
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (c->klass->resize) {
c->klass->resize(c);