X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=block%2Fblock-copy.c;h=bb947afddad7681354330804e359e66fae48aae2;hb=8bdaed0f30ef8e8fb04d7909d7b9639bf98ce454;hp=f9e871b64fb8764a4ea39075aa15207e2479e616;hpb=8146b357d0cb3a3f5d500a1536f9f0e1ff3302cc;p=mirror_qemu.git

diff --git a/block/block-copy.c b/block/block-copy.c
index f9e871b64f..bb947afdda 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -17,21 +17,33 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "block/block-copy.h"
+#include "block/reqlist.h"
 #include "sysemu/block-backend.h"
 #include "qemu/units.h"
 #include "qemu/coroutine.h"
 #include "block/aio_task.h"
+#include "qemu/error-report.h"
+#include "qemu/memalign.h"

 #define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
 #define BLOCK_COPY_MAX_BUFFER (1 * MiB)
 #define BLOCK_COPY_MAX_MEM (128 * MiB)
 #define BLOCK_COPY_MAX_WORKERS 64
 #define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */
+#define BLOCK_COPY_CLUSTER_SIZE_DEFAULT (1 << 16)
+
+typedef enum {
+    COPY_READ_WRITE_CLUSTER,
+    COPY_READ_WRITE,
+    COPY_WRITE_ZEROES,
+    COPY_RANGE_SMALL,
+    COPY_RANGE_FULL
+} BlockCopyMethod;

 static coroutine_fn int block_copy_task_entry(AioTask *task);

 typedef struct BlockCopyCallState {
-    /* IN parameters. Initialized in block_copy_async() and never changed. */
+    /* Fields initialized in block_copy_async() and never changed. */
     BlockCopyState *s;
     int64_t offset;
     int64_t bytes;
@@ -40,38 +52,60 @@ typedef struct BlockCopyCallState {
     bool ignore_ratelimit;
     BlockCopyAsyncCallbackFunc cb;
     void *cb_opaque;
-    /* Coroutine where async block-copy is running */
     Coroutine *co;

+    /* Fields whose state changes throughout the execution */
+    bool finished; /* atomic */
+    QemuCoSleep sleep; /* TODO: protect API with a lock */
+    bool cancelled; /* atomic */
     /* To reference all call states from BlockCopyState */
     QLIST_ENTRY(BlockCopyCallState) list;

-    /* State */
-    int ret;
-    bool finished;
-    QemuCoSleep sleep;
-    bool cancelled;
-
-    /* OUT parameters */
+    /*
+     * Fields that report information about return values and errors.
+     * Protected by lock in BlockCopyState.
+     */
     bool error_is_read;
+    /*
+     * @ret is set concurrently by tasks under mutex. It is only set once, by
+     * the first failed task (and left untouched if no task failed).
+     * After finishing (call_state->finished is true), it is not modified
+     * anymore and may be safely read without the mutex.
+     */
+    int ret;
 } BlockCopyCallState;

 typedef struct BlockCopyTask {
     AioTask task;

+    /*
+     * Fields initialized in block_copy_task_create()
+     * and never changed.
+     */
     BlockCopyState *s;
     BlockCopyCallState *call_state;
-    int64_t offset;
-    int64_t bytes;
-    bool zeroes;
-    QLIST_ENTRY(BlockCopyTask) list;
-    CoQueue wait_queue; /* coroutines blocked on this task */
+    /*
+     * @method can also be set again in the while loop of
+     * block_copy_dirty_clusters(), but it is never accessed concurrently
+     * because the only other function that reads it is
+     * block_copy_task_entry() and it is invoked afterwards in the same
+     * iteration.
+     */
+    BlockCopyMethod method;
+
+    /*
+     * Generally, req is protected by the lock in BlockCopyState. Still,
+     * req.offset is only set on task creation, so it may be read concurrently
+     * after creation. req.bytes is changed at most once, and we need only
+     * protect against a parallel read racing with the @bytes update in
+     * block_copy_task_shrink().
+     */
+    BlockReq req;
 } BlockCopyTask;
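The @finished/@ret contract spelled out in the comments above is worth a consumer-side illustration: @ret is published under the state lock by the first failing task, and only becomes safe to read without the lock once @finished is observed true. A minimal sketch of that contract follows; poll_until_done() is a hypothetical helper, not part of this patch, while block_copy_call_finished() and block_copy_call_status() are the accessors added further down in this diff:

static int poll_until_done(BlockCopyCallState *call_state)
{
    /* block_copy_call_finished() reads @finished with atomic semantics. */
    while (!block_copy_call_finished(call_state)) {
        /* Let the copy coroutine make progress. */
        aio_poll(qemu_get_aio_context(), true);
    }
    /* @finished was observed true, so @ret is stable: no lock needed. */
    return block_copy_call_status(call_state, NULL);
}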

 static int64_t task_end(BlockCopyTask *task)
 {
-    return task->offset + task->bytes;
+    return task->req.offset + task->req.bytes;
 }

 typedef struct BlockCopyState {
@@ -82,17 +116,25 @@ typedef struct BlockCopyState {
      */
     BdrvChild *source;
     BdrvChild *target;
-    BdrvDirtyBitmap *copy_bitmap;
-    int64_t in_flight_bytes;
+
+    /*
+     * Fields initialized in block_copy_state_new()
+     * and never changed.
+     */
     int64_t cluster_size;
-    bool use_copy_range;
-    int64_t copy_size;
+    int64_t max_transfer;
     uint64_t len;
-    QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls */
-    QLIST_HEAD(, BlockCopyCallState) calls;
-    BdrvRequestFlags write_flags;
+
+    /*
+     * Fields whose state changes throughout the execution
+     * Protected by lock.
+     */
+    CoMutex lock;
+    int64_t in_flight_bytes;
+    BlockCopyMethod method;
+    BlockReqList reqs;
+    QLIST_HEAD(, BlockCopyCallState) calls;

     /*
      * skip_unallocated:
      *
@@ -107,59 +149,46 @@ typedef struct BlockCopyState {
      * skip unallocated regions, clear them in the copy_bitmap, and invoke
      * block_copy_reset_unallocated() every time it does.
      */
-    bool skip_unallocated;
-
+    bool skip_unallocated; /* atomic */
+    /* State fields that use a thread-safe API */
+    BdrvDirtyBitmap *copy_bitmap;
     ProgressMeter *progress;
-
     SharedResource *mem;
-
-    uint64_t speed;
     RateLimit rate_limit;
 } BlockCopyState;

-static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
-                                            int64_t offset, int64_t bytes)
-{
-    BlockCopyTask *t;
-
-    QLIST_FOREACH(t, &s->tasks, list) {
-        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
-            return t;
-        }
-    }
-
-    return NULL;
-}
-
-/*
- * If there are no intersecting tasks return false. Otherwise, wait for the
- * first found intersecting tasks to finish and return true.
- */
-static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
-                                             int64_t bytes)
+/* Called with lock held */
+static int64_t block_copy_chunk_size(BlockCopyState *s)
 {
-    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);
-
-    if (!task) {
-        return false;
+    switch (s->method) {
+    case COPY_READ_WRITE_CLUSTER:
+        return s->cluster_size;
+    case COPY_READ_WRITE:
+    case COPY_RANGE_SMALL:
+        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
+                   s->max_transfer);
+    case COPY_RANGE_FULL:
+        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+                   s->max_transfer);
+    default:
+        /* Cannot have COPY_WRITE_ZEROES here. */
+        abort();
     }
-
-    qemu_co_queue_wait(&task->wait_queue, NULL);
-
-    return true;
 }
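To make block_copy_chunk_size() concrete, take the constants defined at the top of this file (BLOCK_COPY_CLUSTER_SIZE_DEFAULT = 64 KiB, BLOCK_COPY_MAX_BUFFER = 1 MiB, BLOCK_COPY_MAX_COPY_RANGE = 16 MiB) and assume a hypothetical device pair whose cluster-aligned max_transfer is 8 MiB. Then COPY_READ_WRITE_CLUSTER yields 64 KiB chunks; COPY_READ_WRITE and COPY_RANGE_SMALL yield MIN(MAX(64 KiB, 1 MiB), 8 MiB) = 1 MiB; and COPY_RANGE_FULL yields MIN(MAX(64 KiB, 16 MiB), 8 MiB) = 8 MiB.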

 /*
  * Search for the first dirty area in offset/bytes range and create task at
  * the beginning of it.
  */
-static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
-                                             BlockCopyCallState *call_state,
-                                             int64_t offset, int64_t bytes)
+static coroutine_fn BlockCopyTask *
+block_copy_task_create(BlockCopyState *s, BlockCopyCallState *call_state,
+                       int64_t offset, int64_t bytes)
 {
     BlockCopyTask *task;
-    int64_t max_chunk = MIN_NON_ZERO(s->copy_size, call_state->max_chunk);
+    int64_t max_chunk;

+    QEMU_LOCK_GUARD(&s->lock);
+    max_chunk = MIN_NON_ZERO(block_copy_chunk_size(s), call_state->max_chunk);
     if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                            offset, offset + bytes,
                                            max_chunk, &offset, &bytes))
@@ -171,7 +200,7 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
     bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

     /* region is dirty, so no existent tasks possible in it */
-    assert(!find_conflicting_task(s, offset, bytes));
+    assert(!reqlist_find_conflict(&s->reqs, offset, bytes));

     bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
     s->in_flight_bytes += bytes;
@@ -181,11 +210,9 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
         .task.func = block_copy_task_entry,
         .s = s,
         .call_state = call_state,
-        .offset = offset,
-        .bytes = bytes,
+        .method = s->method,
     };
-    qemu_co_queue_init(&task->wait_queue);
-    QLIST_INSERT_HEAD(&s->tasks, task, list);
+    reqlist_init_req(&s->reqs, &task->req, offset, bytes);

     return task;
 }
@@ -200,28 +227,35 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
 static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                 int64_t new_bytes)
 {
-    if (new_bytes == task->bytes) {
+    QEMU_LOCK_GUARD(&task->s->lock);
+    if (new_bytes == task->req.bytes) {
         return;
     }

-    assert(new_bytes > 0 && new_bytes < task->bytes);
+    assert(new_bytes > 0 && new_bytes < task->req.bytes);

-    task->s->in_flight_bytes -= task->bytes - new_bytes;
+    task->s->in_flight_bytes -= task->req.bytes - new_bytes;
     bdrv_set_dirty_bitmap(task->s->copy_bitmap,
-                          task->offset + new_bytes, task->bytes - new_bytes);
+                          task->req.offset + new_bytes,
+                          task->req.bytes - new_bytes);

-    task->bytes = new_bytes;
-    qemu_co_queue_restart_all(&task->wait_queue);
+    reqlist_shrink_req(&task->req, new_bytes);
 }

 static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
 {
-    task->s->in_flight_bytes -= task->bytes;
+    QEMU_LOCK_GUARD(&task->s->lock);
+    task->s->in_flight_bytes -= task->req.bytes;
     if (ret < 0) {
-        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->offset, task->bytes);
+        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->req.offset,
+                              task->req.bytes);
     }
-    QLIST_REMOVE(task, list);
-    qemu_co_queue_restart_all(&task->wait_queue);
+    if (task->s->progress) {
+        progress_set_remaining(task->s->progress,
+                               bdrv_get_dirty_count(task->s->copy_bitmap) +
+                               task->s->in_flight_bytes);
+    }
+    reqlist_remove_req(&task->req);
 }

 void block_copy_state_free(BlockCopyState *s)
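A worked example of the shrink path above, with hypothetical numbers: suppose a task is created with req.offset = 0 and req.bytes = 1 MiB, and block_copy_block_status() later reports that only the first 256 KiB share one allocation status. block_copy_task_shrink(task, 256 KiB) then subtracts the trailing 768 KiB from in_flight_bytes, re-dirties that range in the copy bitmap so a later pass picks it up, and lets reqlist_shrink_req() wake any coroutine that was waiting on the released part of the request.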
@@ -243,12 +277,84 @@ static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
                      target->bs->bl.max_transfer));
 }

+void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+                              bool compress)
+{
+    /* Keep BDRV_REQ_SERIALISING set (or not set) in block_copy_state_new() */
+    s->write_flags = (s->write_flags & BDRV_REQ_SERIALISING) |
+        (compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+
+    if (s->max_transfer < s->cluster_size) {
+        /*
+         * copy_range does not respect max_transfer. We don't want to bother
+         * with requests smaller than the block-copy cluster size, so fall
+         * back to buffered copying (read and write respect max_transfer on
+         * their own).
+         */
+        s->method = COPY_READ_WRITE_CLUSTER;
+    } else if (compress) {
+        /* Compression supports only cluster-size writes and no copy-range. */
+        s->method = COPY_READ_WRITE_CLUSTER;
+    } else {
+        /*
+         * If copy-range is enabled, start with COPY_RANGE_SMALL until the
+         * first successful copy_range (see block_copy_do_copy).
+         */
+        s->method = use_copy_range ? COPY_RANGE_SMALL : COPY_READ_WRITE;
+    }
+}
+
+static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+                                                 Error **errp)
+{
+    int ret;
+    BlockDriverInfo bdi;
+    bool target_does_cow = bdrv_backing_chain_next(target);
+
+    /*
+     * If there is no backing file on the target, we cannot rely on COW if our
+     * backup cluster size is smaller than the target cluster size. Even for
+     * targets with a backing file, try to avoid COW if possible.
+     */
+    ret = bdrv_get_info(target, &bdi);
+    if (ret == -ENOTSUP && !target_does_cow) {
+        /* Cluster size is not defined */
+        warn_report("The target block device doesn't provide "
+                    "information about the block size and it doesn't have a "
+                    "backing file. The default block size of %u bytes is "
+                    "used. If the actual block size of the target exceeds "
+                    "this default, the backup may be unusable",
+                    BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
+    } else if (ret < 0 && !target_does_cow) {
+        error_setg_errno(errp, -ret,
+            "Couldn't determine the cluster size of the target image, "
+            "which has no backing file");
+        error_append_hint(errp,
+            "Aborting, since this may create an unusable destination image\n");
+        return ret;
+    } else if (ret < 0 && target_does_cow) {
+        /* Not fatal; just trudge on ahead. */
+        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
+    }
+
+    return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
+}
+
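Worked examples for the cluster-size decision above (all values hypothetical): a qcow2 target reporting bdi.cluster_size = 2 MiB gives MAX(64 KiB, 2 MiB) = 2 MiB; a target reporting a 512-byte block size gives the 64 KiB default. When bdrv_get_info() fails, the outcome depends on the backing file: -ENOTSUP without a backing file warns and uses the default, any other error without a backing file is fatal, and any error with a backing file silently uses the default, because the target's own COW keeps partial-cluster writes safe.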
 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                     int64_t cluster_size, bool use_copy_range,
-                                     BdrvRequestFlags write_flags, Error **errp)
+                                     const BdrvDirtyBitmap *bitmap,
+                                     Error **errp)
 {
+    ERRP_GUARD();
     BlockCopyState *s;
+    int64_t cluster_size;
     BdrvDirtyBitmap *copy_bitmap;
+    bool is_fleecing;
+
+    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
+    if (cluster_size < 0) {
+        return NULL;
+    }

     copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                            errp);
@@ -256,6 +362,33 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
         return NULL;
     }
     bdrv_disable_dirty_bitmap(copy_bitmap);
+    if (bitmap) {
+        if (!bdrv_merge_dirty_bitmap(copy_bitmap, bitmap, NULL, errp)) {
+            error_prepend(errp, "Failed to merge bitmap '%s' to internal "
+                          "copy-bitmap: ", bdrv_dirty_bitmap_name(bitmap));
+            bdrv_release_dirty_bitmap(copy_bitmap);
+            return NULL;
+        }
+    } else {
+        bdrv_set_dirty_bitmap(copy_bitmap, 0,
+                              bdrv_dirty_bitmap_size(copy_bitmap));
+    }
+
+    /*
+     * If the source is in the backing chain of the target, assume that the
+     * target is going to be used for "image fleecing", i.e. it should
+     * represent a kind of snapshot of the source at the backup-start point
+     * in time, and the target is going to be read by somebody (for example,
+     * used as an NBD export) while the backup job runs.
+     *
+     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
+     * avoid intersection of backup writes and third-party reads from the
+     * target; otherwise, when reading from the target, we may occasionally
+     * read data that has already been updated by the guest.
+     *
+     * For more information see commit f8d59dfb40bb and test
+     * tests/qemu-iotests/222
+     */
+    is_fleecing = bdrv_chain_contains(target->bs, source->bs);

     s = g_new(BlockCopyState, 1);
     *s = (BlockCopyState) {
@@ -264,39 +397,24 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
         .copy_bitmap = copy_bitmap,
         .cluster_size = cluster_size,
         .len = bdrv_dirty_bitmap_size(copy_bitmap),
-        .write_flags = write_flags,
+        .write_flags = (is_fleecing ? BDRV_REQ_SERIALISING : 0),
         .mem = shres_create(BLOCK_COPY_MAX_MEM),
+        .max_transfer = QEMU_ALIGN_DOWN(
+                                    block_copy_max_transfer(source, target),
+                                    cluster_size),
     };

-    if (block_copy_max_transfer(source, target) < cluster_size) {
-        /*
-         * copy_range does not respect max_transfer. We don't want to bother
-         * with requests smaller than block-copy cluster size, so fallback to
-         * buffered copying (read and write respect max_transfer on their
-         * behalf).
-         */
-        s->use_copy_range = false;
-        s->copy_size = cluster_size;
-    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
-        /* Compression supports only cluster-size writes and no copy-range. */
-        s->use_copy_range = false;
-        s->copy_size = cluster_size;
-    } else {
-        /*
-         * We enable copy-range, but keep small copy_size, until first
-         * successful copy_range (look at block_copy_do_copy).
-         */
-        s->use_copy_range = use_copy_range;
-        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
-    }
+    block_copy_set_copy_opts(s, false, false);

     ratelimit_init(&s->rate_limit);
-    QLIST_INIT(&s->tasks);
+    qemu_co_mutex_init(&s->lock);
+    QLIST_INIT(&s->reqs);
     QLIST_INIT(&s->calls);

     return s;
 }

+/* Only set before running the job, no need for locking. */
 void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
 {
     s->progress = pm;
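The fleecing test in block_copy_state_new() is easier to see with a topology sketch; the node roles below are illustrative, not part of this patch:

/*
 *   guest writes --> source (active disk)
 *                       ^
 *                       | backing
 *   NBD client   --> target (temporary overlay)
 *
 * target->bs has source->bs in its backing chain, so
 * bdrv_chain_contains(target->bs, source->bs) is true and is_fleecing is
 * set. Block-copy writes to the target then carry BDRV_REQ_SERIALISING,
 * so they cannot interleave with an in-flight third-party read of a
 * cluster that has not been copied yet.
 */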
@@ -323,7 +441,7 @@ static coroutine_fn int block_copy_task_run(AioTaskPool *pool,

     aio_task_pool_wait_slot(pool);
     if (aio_task_pool_status(pool) < 0) {
-        co_put_to_shres(task->s->mem, task->bytes);
+        co_put_to_shres(task->s->mem, task->req.bytes);
         block_copy_task_end(task, -ECANCELED);
         g_free(task);
         return -ECANCELED;
@@ -342,11 +460,15 @@ static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
  *
  * No sync here: neither bitmap nor intersecting-request handling, only copy.
  *
+ * @method is an in-out argument, so that copy_range can be either extended to
+ * a full-size buffer or disabled if the copy_range attempt fails. The output
+ * value of @method should be used for subsequent tasks.
  * Returns 0 on success.
  */
 static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                            int64_t offset, int64_t bytes,
-                                           bool zeroes, bool *error_is_read)
+                                           BlockCopyMethod *method,
+                                           bool *error_is_read)
 {
     int ret;
     int64_t nbytes = MIN(offset + bytes, s->len) - offset;
@@ -360,7 +482,8 @@ static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
            offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
     assert(nbytes < INT_MAX);

-    if (zeroes) {
+    switch (*method) {
+    case COPY_WRITE_ZEROES:
         ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes,
                                     s->write_flags & ~BDRV_REQ_WRITE_COMPRESSED);
         if (ret < 0) {
@@ -368,86 +491,87 @@ static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
             *error_is_read = false;
         }
         return ret;
-    }

-    if (s->use_copy_range) {
+    case COPY_RANGE_SMALL:
+    case COPY_RANGE_FULL:
         ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                  0, s->write_flags);
+        if (ret >= 0) {
+            /* Successful copy-range, increase chunk size. */
+            *method = COPY_RANGE_FULL;
+            return 0;
+        }
+
+        trace_block_copy_copy_range_fail(s, offset, ret);
+        *method = COPY_READ_WRITE;
+        /* Fall through to read+write with allocated buffer */
+
+    case COPY_READ_WRITE_CLUSTER:
+    case COPY_READ_WRITE:
+        /*
+         * In case of a failed copy_range request above, we may proceed with
+         * a buffered request larger than BLOCK_COPY_MAX_BUFFER.
+         * Still, further requests will be properly limited, so don't worry
+         * too much. Moreover, the most likely case (copy_range is unsupported
+         * for the configuration, so the very first copy_range request fails)
+         * is handled by switching to a large chunk size only after the first
+         * successful copy_range.
+         */
+
+        bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+
+        ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
         if (ret < 0) {
-            trace_block_copy_copy_range_fail(s, offset, ret);
-            s->use_copy_range = false;
-            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
-            /* Fallback to read+write with allocated buffer */
-        } else {
-            if (s->use_copy_range) {
-                /*
-                 * Successful copy-range. Now increase copy_size. copy_range
-                 * does not respect max_transfer (it's a TODO), so we factor
-                 * that in here.
-                 *
-                 * Note: we double-check s->use_copy_range for the case when
-                 * parallel block-copy request unsets it during previous
-                 * bdrv_co_copy_range call.
-                 */
-                s->copy_size =
-                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
-                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
-                                            s->cluster_size));
-            }
+            trace_block_copy_read_fail(s, offset, ret);
+            *error_is_read = true;
             goto out;
         }
-    }

-    /*
-     * In case of failed copy_range request above, we may proceed with buffered
-     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
-     * be properly limited, so don't care too much. Moreover the most likely
-     * case (copy_range is unsupported for the configuration, so the very first
-     * copy_range request fails) is handled by setting large copy_size only
-     * after first successful copy_range.
-     */
+        ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
+                             s->write_flags);
+        if (ret < 0) {
+            trace_block_copy_write_fail(s, offset, ret);
+            *error_is_read = false;
+            goto out;
+        }

-    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+    out:
+        qemu_vfree(bounce_buffer);
+        break;

-    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
-    if (ret < 0) {
-        trace_block_copy_read_fail(s, offset, ret);
-        *error_is_read = true;
-        goto out;
+    default:
+        abort();
     }

-    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
-                         s->write_flags);
-    if (ret < 0) {
-        trace_block_copy_write_fail(s, offset, ret);
-        *error_is_read = false;
-        goto out;
-    }
-
-out:
-    qemu_vfree(bounce_buffer);
-
     return ret;
 }
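The @method transitions implemented in block_copy_do_copy() form a small state machine, summarized here: COPY_RANGE_SMALL and COPY_RANGE_FULL move to COPY_RANGE_FULL on the first successful bdrv_co_copy_range() (widening subsequent chunks toward BLOCK_COPY_MAX_COPY_RANGE) and fall back to COPY_READ_WRITE on failure; COPY_READ_WRITE_CLUSTER and COPY_READ_WRITE never change; COPY_WRITE_ZEROES is chosen per task by the caller and is never stored back into BlockCopyState.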

 static coroutine_fn int block_copy_task_entry(AioTask *task)
 {
     BlockCopyTask *t = container_of(task, BlockCopyTask, task);
+    BlockCopyState *s = t->s;
     bool error_is_read = false;
+    BlockCopyMethod method = t->method;
     int ret;

-    ret = block_copy_do_copy(t->s, t->offset, t->bytes, t->zeroes,
+    ret = block_copy_do_copy(s, t->req.offset, t->req.bytes, &method,
                              &error_is_read);
-    if (ret < 0) {
-        if (!t->call_state->ret) {
-            t->call_state->ret = ret;
-            t->call_state->error_is_read = error_is_read;
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        if (s->method == t->method) {
+            s->method = method;
+        }
+
+        if (ret < 0) {
+            if (!t->call_state->ret) {
+                t->call_state->ret = ret;
+                t->call_state->error_is_read = error_is_read;
+            }
+        } else if (s->progress) {
+            progress_work_done(s->progress, t->req.bytes);
         }
-    } else {
-        progress_work_done(t->s->progress, t->bytes);
     }
-    co_put_to_shres(t->s->mem, t->bytes);
+    co_put_to_shres(s->mem, t->req.bytes);
     block_copy_task_end(t, ret);

     return ret;
@@ -460,7 +584,7 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset,
     BlockDriverState *base;
     int ret;

-    if (s->skip_unallocated) {
+    if (qatomic_read(&s->skip_unallocated)) {
         base = bdrv_backing_chain_next(s->source->bs);
     } else {
         base = NULL;
@@ -527,6 +651,18 @@ static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
     }
 }

+void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
+{
+    QEMU_LOCK_GUARD(&s->lock);
+
+    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
+    if (s->progress) {
+        progress_set_remaining(s->progress,
+                               bdrv_get_dirty_count(s->copy_bitmap) +
+                               s->in_flight_bytes);
+    }
+}
+
 /*
  * Reset bits in copy_bitmap starting at offset if they represent unallocated
  * data in the image. May reset subsequent contiguous bits.
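One subtlety in block_copy_task_entry() deserves a worked example (the timing is hypothetical): tasks A and B both start while s->method is COPY_RANGE_SMALL; A's copy_range succeeds and publishes COPY_RANGE_FULL; B's copy_range then fails and computes COPY_READ_WRITE. Because s->method no longer matches B's starting t->method, B's result is discarded and A's upgrade survives: only a task that started from the current method may change it.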
@@ -547,10 +683,7 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s,
     bytes = clusters * s->cluster_size;

     if (!ret) {
-        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
-        progress_set_remaining(s->progress,
-                               bdrv_get_dirty_count(s->copy_bitmap) +
-                               s->in_flight_bytes);
+        block_copy_reset(s, offset, bytes);
     }

     *count = bytes;
@@ -586,7 +719,8 @@ block_copy_dirty_clusters(BlockCopyCallState *call_state)
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
     assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

-    while (bytes && aio_task_pool_status(aio) == 0 && !call_state->cancelled) {
+    while (bytes && aio_task_pool_status(aio) == 0 &&
+           !qatomic_read(&call_state->cancelled)) {
         BlockCopyTask *task;
         int64_t status_bytes;

@@ -596,49 +730,47 @@ block_copy_dirty_clusters(BlockCopyCallState *call_state)
             trace_block_copy_skip_range(s, offset, bytes);
             break;
         }
-        if (task->offset > offset) {
-            trace_block_copy_skip_range(s, offset, task->offset - offset);
+        if (task->req.offset > offset) {
+            trace_block_copy_skip_range(s, offset, task->req.offset - offset);
         }

         found_dirty = true;

-        ret = block_copy_block_status(s, task->offset, task->bytes,
+        ret = block_copy_block_status(s, task->req.offset, task->req.bytes,
                                       &status_bytes);
         assert(ret >= 0); /* never fail */
-        if (status_bytes < task->bytes) {
+        if (status_bytes < task->req.bytes) {
             block_copy_task_shrink(task, status_bytes);
         }
-        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
+        if (qatomic_read(&s->skip_unallocated) &&
+            !(ret & BDRV_BLOCK_ALLOCATED)) {
             block_copy_task_end(task, 0);
-            progress_set_remaining(s->progress,
-                                   bdrv_get_dirty_count(s->copy_bitmap) +
-                                   s->in_flight_bytes);
-            trace_block_copy_skip_range(s, task->offset, task->bytes);
+            trace_block_copy_skip_range(s, task->req.offset, task->req.bytes);
             offset = task_end(task);
             bytes = end - offset;
             g_free(task);
             continue;
         }
-        task->zeroes = ret & BDRV_BLOCK_ZERO;
-
-        if (s->speed) {
-            if (!call_state->ignore_ratelimit) {
-                uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
-                if (ns > 0) {
-                    block_copy_task_end(task, -EAGAIN);
-                    g_free(task);
-                    qemu_co_sleep_ns_wakeable(&call_state->sleep,
-                                              QEMU_CLOCK_REALTIME, ns);
-                    continue;
-                }
-            }
+        if (ret & BDRV_BLOCK_ZERO) {
+            task->method = COPY_WRITE_ZEROES;
+        }

-            ratelimit_calculate_delay(&s->rate_limit, task->bytes);
+        if (!call_state->ignore_ratelimit) {
+            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
+            if (ns > 0) {
+                block_copy_task_end(task, -EAGAIN);
+                g_free(task);
+                qemu_co_sleep_ns_wakeable(&call_state->sleep,
+                                          QEMU_CLOCK_REALTIME, ns);
+                continue;
+            }
         }

-        trace_block_copy_process(s, task->offset);
+        ratelimit_calculate_delay(&s->rate_limit, task->req.bytes);
+
+        trace_block_copy_process(s, task->req.offset);

-        co_get_from_shres(s->mem, task->bytes);
+        co_get_from_shres(s->mem, task->req.bytes);

         offset = task_end(task);
         bytes = end - offset;
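The two ratelimit_calculate_delay() calls above separate throttling from accounting, which a numeric example may clarify (values hypothetical, and assuming the usual ratelimit semantics where a zero-byte call only queries the current debt): with the speed set to 64 MiB/s and BLOCK_COPY_SLICE_TIME of 100 ms, each slice admits about 6.4 MiB. The loop first asks for a delay with 0 bytes; if the slice quota is already exhausted, the task is ended with -EAGAIN (re-dirtying its clusters) and the coroutine sleeps. Otherwise the task's req.bytes are charged to the limiter and the work proceeds, with any overshoot paid for in the next slice.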
@@ -691,15 +823,40 @@ void block_copy_kick(BlockCopyCallState *call_state)
 static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
 {
     int ret;
+    BlockCopyState *s = call_state->s;

-    QLIST_INSERT_HEAD(&call_state->s->calls, call_state, list);
+    qemu_co_mutex_lock(&s->lock);
+    QLIST_INSERT_HEAD(&s->calls, call_state, list);
+    qemu_co_mutex_unlock(&s->lock);

     do {
         ret = block_copy_dirty_clusters(call_state);

-        if (ret == 0 && !call_state->cancelled) {
-            ret = block_copy_wait_one(call_state->s, call_state->offset,
-                                      call_state->bytes);
+        if (ret == 0 && !qatomic_read(&call_state->cancelled)) {
+            WITH_QEMU_LOCK_GUARD(&s->lock) {
+                /*
+                 * Check that there is no task we still need to wait for.
+                 */
+                ret = reqlist_wait_one(&s->reqs, call_state->offset,
+                                       call_state->bytes, &s->lock);
+                if (ret == 0) {
+                    /*
+                     * No pending tasks, but check the bitmap again in this
+                     * same critical section, since a task might have failed
+                     * between this and the critical section in
+                     * block_copy_dirty_clusters().
+                     *
+                     * A reqlist_wait_one() return value of 0 also means that
+                     * it didn't release the lock. So, we are still in the
+                     * same critical section, not interrupted by any
+                     * concurrent access to state.
+                     */
+                    ret = bdrv_dirty_bitmap_next_dirty(s->copy_bitmap,
+                                                       call_state->offset,
+                                                       call_state->bytes) >= 0;
+                }
+            }
         }

         /*
@@ -711,36 +868,57 @@ static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
          * 2. We have waited for some intersecting block-copy request
          *    It may have failed and produced new dirty bits.
          */
-    } while (ret > 0 && !call_state->cancelled);
+    } while (ret > 0 && !qatomic_read(&call_state->cancelled));

-    call_state->finished = true;
+    qatomic_store_release(&call_state->finished, true);

     if (call_state->cb) {
         call_state->cb(call_state->cb_opaque);
     }

+    qemu_co_mutex_lock(&s->lock);
     QLIST_REMOVE(call_state, list);
+    qemu_co_mutex_unlock(&s->lock);

     return ret;
 }

+static void coroutine_fn block_copy_async_co_entry(void *opaque)
+{
+    block_copy_common(opaque);
+}
+
 int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
-                            bool ignore_ratelimit)
+                            bool ignore_ratelimit, uint64_t timeout_ns,
+                            BlockCopyAsyncCallbackFunc cb,
+                            void *cb_opaque)
 {
-    BlockCopyCallState call_state = {
+    int ret;
+    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);
+
+    *call_state = (BlockCopyCallState) {
         .s = s,
         .offset = start,
         .bytes = bytes,
         .ignore_ratelimit = ignore_ratelimit,
         .max_workers = BLOCK_COPY_MAX_WORKERS,
+        .cb = cb,
+        .cb_opaque = cb_opaque,
     };

-    return block_copy_common(&call_state);
-}
+    ret = qemu_co_timeout(block_copy_async_co_entry, call_state, timeout_ns,
+                          g_free);
+    if (ret < 0) {
+        assert(ret == -ETIMEDOUT);
+        block_copy_call_cancel(call_state);
+        /* call_state will be freed by the running coroutine. */
+        return ret;
+    }

-static void coroutine_fn block_copy_async_co_entry(void *opaque)
-{
-    block_copy_common(opaque);
+    ret = call_state->ret;
+    g_free(call_state);
+
+    return ret;
 }
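A usage sketch for the timeout path above, called from coroutine context since block_copy() is a coroutine_fn; the request size and the 30-second timeout are made-up values, and NANOSECONDS_PER_SECOND comes from include/qemu/timer.h:

    ret = block_copy(s, 0, 1 * MiB, false, 30 * NANOSECONDS_PER_SECOND,
                     NULL, NULL);
    if (ret == -ETIMEDOUT) {
        /*
         * The call was cancelled on timeout; the still-running coroutine
         * now owns call_state and will g_free() it when it finishes.
         */
    }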

 BlockCopyCallState *block_copy_async(BlockCopyState *s,
@@ -774,44 +952,50 @@ void block_copy_call_free(BlockCopyCallState *call_state)
         return;
     }

-    assert(call_state->finished);
+    assert(qatomic_read(&call_state->finished));
     g_free(call_state);
 }

 bool block_copy_call_finished(BlockCopyCallState *call_state)
 {
-    return call_state->finished;
+    return qatomic_read(&call_state->finished);
 }

 bool block_copy_call_succeeded(BlockCopyCallState *call_state)
 {
-    return call_state->finished && !call_state->cancelled &&
-        call_state->ret == 0;
+    return qatomic_load_acquire(&call_state->finished) &&
+           !qatomic_read(&call_state->cancelled) &&
+           call_state->ret == 0;
 }

 bool block_copy_call_failed(BlockCopyCallState *call_state)
 {
-    return call_state->finished && !call_state->cancelled &&
-        call_state->ret < 0;
+    return qatomic_load_acquire(&call_state->finished) &&
+           !qatomic_read(&call_state->cancelled) &&
+           call_state->ret < 0;
 }

 bool block_copy_call_cancelled(BlockCopyCallState *call_state)
 {
-    return call_state->cancelled;
+    return qatomic_read(&call_state->cancelled);
 }

 int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
 {
-    assert(call_state->finished);
+    assert(qatomic_load_acquire(&call_state->finished));
     if (error_is_read) {
         *error_is_read = call_state->error_is_read;
     }

     return call_state->ret;
 }

+/*
+ * Note that cancelling and finishing are racy: a user can cancel a
+ * block-copy that has already finished.
+ */
 void block_copy_call_cancel(BlockCopyCallState *call_state)
 {
-    call_state->cancelled = true;
+    qatomic_set(&call_state->cancelled, true);
     block_copy_kick(call_state);
 }

@@ -820,17 +1004,19 @@ BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
     return s->copy_bitmap;
 }

+int64_t block_copy_cluster_size(BlockCopyState *s)
+{
+    return s->cluster_size;
+}
+
 void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
 {
-    s->skip_unallocated = skip;
+    qatomic_set(&s->skip_unallocated, skip);
 }

 void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
 {
-    s->speed = speed;
-    if (speed > 0) {
-        ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);
-    }
+    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

     /*
      * Note: it's good to kick all call states from here, but it should be done