]> git.proxmox.com Git - mirror_qemu.git/blame - block/block-copy.c
block-copy: small refactor in block_copy_task_entry and block_copy_common
[mirror_qemu.git] / block / block-copy.c
CommitLineData
beb5f545
VSO
1/*
2 * block_copy API
3 *
4 * Copyright (C) 2013 Proxmox Server Solutions
5 * Copyright (c) 2019 Virtuozzo International GmbH.
6 *
7 * Authors:
8 * Dietmar Maurer (dietmar@proxmox.com)
9 * Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 */
14
15#include "qemu/osdep.h"
16
17#include "trace.h"
18#include "qapi/error.h"
19#include "block/block-copy.h"
20#include "sysemu/block-backend.h"
b3b7036a 21#include "qemu/units.h"
4ce5dd3e
VSO
22#include "qemu/coroutine.h"
23#include "block/aio_task.h"
b3b7036a
VSO
24
25#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
0e240245 26#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
7f739d0e 27#define BLOCK_COPY_MAX_MEM (128 * MiB)
4ce5dd3e 28#define BLOCK_COPY_MAX_WORKERS 64
7e032df0 29#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */
4ce5dd3e
VSO
30
31static coroutine_fn int block_copy_task_entry(AioTask *task);
32
33typedef struct BlockCopyCallState {
de4641b4 34 /* IN parameters. Initialized in block_copy_async() and never changed. */
3b8c2329
VSO
35 BlockCopyState *s;
36 int64_t offset;
37 int64_t bytes;
26be9d62
VSO
38 int max_workers;
39 int64_t max_chunk;
7e032df0 40 bool ignore_ratelimit;
de4641b4
VSO
41 BlockCopyAsyncCallbackFunc cb;
42 void *cb_opaque;
43
44 /* Coroutine where async block-copy is running */
45 Coroutine *co;
3b8c2329 46
2e099a9d
VSO
47 /* To reference all call states from BlockCopyState */
48 QLIST_ENTRY(BlockCopyCallState) list;
49
3b8c2329 50 /* State */
de4641b4
VSO
51 int ret;
52 bool finished;
29a6ea24 53 QemuCoSleep sleep;
a6d23d56 54 bool cancelled;
3b8c2329
VSO
55
56 /* OUT parameters */
4ce5dd3e
VSO
57 bool error_is_read;
58} BlockCopyCallState;
beb5f545 59
e9407785 60typedef struct BlockCopyTask {
4ce5dd3e
VSO
61 AioTask task;
62
1348a657 63 BlockCopyState *s;
4ce5dd3e 64 BlockCopyCallState *call_state;
397f4e9d
VSO
65 int64_t offset;
66 int64_t bytes;
4ce5dd3e 67 bool zeroes;
bed95234 68 bool copy_range;
e9407785
VSO
69 QLIST_ENTRY(BlockCopyTask) list;
70 CoQueue wait_queue; /* coroutines blocked on this task */
71} BlockCopyTask;
397f4e9d 72
42ac2144
VSO
73static int64_t task_end(BlockCopyTask *task)
74{
75 return task->offset + task->bytes;
76}
77
397f4e9d
VSO
78typedef struct BlockCopyState {
79 /*
80 * BdrvChild objects are not owned or managed by block-copy. They are
81 * provided by block-copy user and user is responsible for appropriate
82 * permissions on these children.
83 */
84 BdrvChild *source;
85 BdrvChild *target;
86 BdrvDirtyBitmap *copy_bitmap;
87 int64_t in_flight_bytes;
88 int64_t cluster_size;
89 bool use_copy_range;
90 int64_t copy_size;
91 uint64_t len;
2e099a9d
VSO
92 QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls */
93 QLIST_HEAD(, BlockCopyCallState) calls;
397f4e9d
VSO
94
95 BdrvRequestFlags write_flags;
96
97 /*
98 * skip_unallocated:
99 *
100 * Used by sync=top jobs, which first scan the source node for unallocated
101 * areas and clear them in the copy_bitmap. During this process, the bitmap
102 * is thus not fully initialized: It may still have bits set for areas that
103 * are unallocated and should actually not be copied.
104 *
105 * This is indicated by skip_unallocated.
106 *
107 * In this case, block_copy() will query the source’s allocation status,
108 * skip unallocated regions, clear them in the copy_bitmap, and invoke
109 * block_copy_reset_unallocated() every time it does.
110 */
111 bool skip_unallocated;
112
113 ProgressMeter *progress;
397f4e9d
VSO
114
115 SharedResource *mem;
7e032df0 116
7e032df0 117 RateLimit rate_limit;
397f4e9d
VSO
118} BlockCopyState;
119
e9407785
VSO
120static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
121 int64_t offset, int64_t bytes)
17187cb6 122{
e9407785 123 BlockCopyTask *t;
17187cb6 124
e9407785
VSO
125 QLIST_FOREACH(t, &s->tasks, list) {
126 if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
127 return t;
17187cb6
VSO
128 }
129 }
130
131 return NULL;
132}
133
5332e5d2 134/*
e9407785
VSO
135 * If there are no intersecting tasks return false. Otherwise, wait for the
136 * first found intersecting tasks to finish and return true.
5332e5d2
VSO
137 */
138static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
139 int64_t bytes)
a6ffe199 140{
e9407785 141 BlockCopyTask *task = find_conflicting_task(s, offset, bytes);
17187cb6 142
e9407785 143 if (!task) {
5332e5d2 144 return false;
17187cb6 145 }
5332e5d2 146
e9407785 147 qemu_co_queue_wait(&task->wait_queue, NULL);
5332e5d2
VSO
148
149 return true;
a6ffe199
VSO
150}
151
42ac2144
VSO
152/*
153 * Search for the first dirty area in offset/bytes range and create task at
154 * the beginning of it.
155 */
f13e60a9 156static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
4ce5dd3e 157 BlockCopyCallState *call_state,
f13e60a9 158 int64_t offset, int64_t bytes)
a6ffe199 159{
42ac2144 160 BlockCopyTask *task;
26be9d62 161 int64_t max_chunk = MIN_NON_ZERO(s->copy_size, call_state->max_chunk);
f13e60a9 162
42ac2144
VSO
163 if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
164 offset, offset + bytes,
26be9d62 165 max_chunk, &offset, &bytes))
42ac2144
VSO
166 {
167 return NULL;
168 }
169
7661a886
SR
170 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
171 bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);
172
42ac2144 173 /* region is dirty, so no existent tasks possible in it */
e9407785 174 assert(!find_conflicting_task(s, offset, bytes));
5332e5d2
VSO
175
176 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
177 s->in_flight_bytes += bytes;
178
42ac2144 179 task = g_new(BlockCopyTask, 1);
1348a657 180 *task = (BlockCopyTask) {
4ce5dd3e 181 .task.func = block_copy_task_entry,
1348a657 182 .s = s,
4ce5dd3e 183 .call_state = call_state,
1348a657
VSO
184 .offset = offset,
185 .bytes = bytes,
bed95234 186 .copy_range = s->use_copy_range,
1348a657 187 };
e9407785
VSO
188 qemu_co_queue_init(&task->wait_queue);
189 QLIST_INSERT_HEAD(&s->tasks, task, list);
f13e60a9
VSO
190
191 return task;
a6ffe199
VSO
192}
193
5332e5d2 194/*
e9407785 195 * block_copy_task_shrink
5332e5d2 196 *
e9407785
VSO
197 * Drop the tail of the task to be handled later. Set dirty bits back and
198 * wake up all tasks waiting for us (may be some of them are not intersecting
199 * with shrunk task)
5332e5d2 200 */
1348a657 201static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
e9407785 202 int64_t new_bytes)
a6ffe199 203{
e9407785 204 if (new_bytes == task->bytes) {
5332e5d2
VSO
205 return;
206 }
207
e9407785 208 assert(new_bytes > 0 && new_bytes < task->bytes);
5332e5d2 209
1348a657
VSO
210 task->s->in_flight_bytes -= task->bytes - new_bytes;
211 bdrv_set_dirty_bitmap(task->s->copy_bitmap,
e9407785 212 task->offset + new_bytes, task->bytes - new_bytes);
5332e5d2 213
e9407785
VSO
214 task->bytes = new_bytes;
215 qemu_co_queue_restart_all(&task->wait_queue);
5332e5d2
VSO
216}
217
1348a657 218static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
5332e5d2 219{
1348a657 220 task->s->in_flight_bytes -= task->bytes;
5332e5d2 221 if (ret < 0) {
1348a657 222 bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->offset, task->bytes);
5332e5d2 223 }
e9407785
VSO
224 QLIST_REMOVE(task, list);
225 qemu_co_queue_restart_all(&task->wait_queue);
a6ffe199
VSO
226}
227
beb5f545
VSO
228void block_copy_state_free(BlockCopyState *s)
229{
230 if (!s) {
231 return;
232 }
233
4951967d 234 ratelimit_destroy(&s->rate_limit);
5deb6cbd 235 bdrv_release_dirty_bitmap(s->copy_bitmap);
7f739d0e 236 shres_destroy(s->mem);
beb5f545
VSO
237 g_free(s);
238}
239
9d31bc53
VSO
240static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
241{
242 return MIN_NON_ZERO(INT_MAX,
243 MIN_NON_ZERO(source->bs->bl.max_transfer,
244 target->bs->bl.max_transfer));
245}
246
00e30f05 247BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
86c6a3b6 248 int64_t cluster_size, bool use_copy_range,
0f4b02b7 249 BdrvRequestFlags write_flags, Error **errp)
beb5f545
VSO
250{
251 BlockCopyState *s;
beb5f545
VSO
252 BdrvDirtyBitmap *copy_bitmap;
253
00e30f05
VSO
254 copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
255 errp);
beb5f545
VSO
256 if (!copy_bitmap) {
257 return NULL;
258 }
259 bdrv_disable_dirty_bitmap(copy_bitmap);
260
261 s = g_new(BlockCopyState, 1);
262 *s = (BlockCopyState) {
00e30f05
VSO
263 .source = source,
264 .target = target,
beb5f545
VSO
265 .copy_bitmap = copy_bitmap,
266 .cluster_size = cluster_size,
267 .len = bdrv_dirty_bitmap_size(copy_bitmap),
268 .write_flags = write_flags,
7f739d0e 269 .mem = shres_create(BLOCK_COPY_MAX_MEM),
beb5f545
VSO
270 };
271
9d31bc53 272 if (block_copy_max_transfer(source, target) < cluster_size) {
0e240245
VSO
273 /*
274 * copy_range does not respect max_transfer. We don't want to bother
275 * with requests smaller than block-copy cluster size, so fallback to
276 * buffered copying (read and write respect max_transfer on their
277 * behalf).
278 */
279 s->use_copy_range = false;
280 s->copy_size = cluster_size;
281 } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
dcfbece6 282 /* Compression supports only cluster-size writes and no copy-range. */
0e240245 283 s->use_copy_range = false;
dcfbece6 284 s->copy_size = cluster_size;
0e240245
VSO
285 } else {
286 /*
9d31bc53
VSO
287 * We enable copy-range, but keep small copy_size, until first
288 * successful copy_range (look at block_copy_do_copy).
0e240245 289 */
86c6a3b6 290 s->use_copy_range = use_copy_range;
9d31bc53 291 s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
0e240245 292 }
beb5f545 293
4951967d 294 ratelimit_init(&s->rate_limit);
e9407785 295 QLIST_INIT(&s->tasks);
2e099a9d 296 QLIST_INIT(&s->calls);
a6ffe199 297
beb5f545 298 return s;
beb5f545
VSO
299}
300
d0ebeca1
VSO
301void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
302{
303 s->progress = pm;
304}
305
4ce5dd3e
VSO
306/*
307 * Takes ownership of @task
308 *
309 * If pool is NULL directly run the task, otherwise schedule it into the pool.
310 *
311 * Returns: task.func return code if pool is NULL
312 * otherwise -ECANCELED if pool status is bad
313 * otherwise 0 (successfully scheduled)
314 */
315static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
316 BlockCopyTask *task)
317{
318 if (!pool) {
319 int ret = task->task.func(&task->task);
320
321 g_free(task);
322 return ret;
323 }
324
325 aio_task_pool_wait_slot(pool);
326 if (aio_task_pool_status(pool) < 0) {
327 co_put_to_shres(task->s->mem, task->bytes);
328 block_copy_task_end(task, -ECANCELED);
329 g_free(task);
330 return -ECANCELED;
331 }
332
333 aio_task_pool_start_task(pool, &task->task);
334
335 return 0;
336}
337
beb5f545 338/*
e332a726
VSO
339 * block_copy_do_copy
340 *
dafaf135
VSO
341 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
342 * s->len only to cover last cluster when s->len is not aligned to clusters.
e332a726
VSO
343 *
344 * No sync here: nor bitmap neighter intersecting requests handling, only copy.
345 *
bed95234
VSO
346 * @copy_range is an in-out argument: if *copy_range is false, copy_range is not
347 * done. If *copy_range is true, copy_range is attempted. If the copy_range
348 * attempt fails, the function falls back to the usual read+write and
349 * *copy_range is set to false. *copy_range and zeroes must not be true
350 * simultaneously.
351 *
e332a726 352 * Returns 0 on success.
beb5f545 353 */
e332a726 354static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
8719091f 355 int64_t offset, int64_t bytes,
bed95234
VSO
356 bool zeroes, bool *copy_range,
357 bool *error_is_read)
beb5f545
VSO
358{
359 int ret;
8719091f 360 int64_t nbytes = MIN(offset + bytes, s->len) - offset;
e332a726 361 void *bounce_buffer = NULL;
beb5f545 362
8719091f
VSO
363 assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
364 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
dafaf135 365 assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
8719091f
VSO
366 assert(offset < s->len);
367 assert(offset + bytes <= s->len ||
368 offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
dafaf135 369 assert(nbytes < INT_MAX);
bed95234 370 assert(!(*copy_range && zeroes));
e332a726 371
2d57511a 372 if (zeroes) {
8719091f 373 ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
2d57511a
VSO
374 ~BDRV_REQ_WRITE_COMPRESSED);
375 if (ret < 0) {
8719091f 376 trace_block_copy_write_zeroes_fail(s, offset, ret);
d7eca542 377 *error_is_read = false;
2d57511a
VSO
378 }
379 return ret;
380 }
381
bed95234 382 if (*copy_range) {
8719091f 383 ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
e332a726
VSO
384 0, s->write_flags);
385 if (ret < 0) {
8719091f 386 trace_block_copy_copy_range_fail(s, offset, ret);
bed95234 387 *copy_range = false;
e332a726
VSO
388 /* Fallback to read+write with allocated buffer */
389 } else {
bed95234 390 return 0;
e332a726
VSO
391 }
392 }
393
0e240245
VSO
394 /*
395 * In case of failed copy_range request above, we may proceed with buffered
396 * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
9d31bc53
VSO
397 * be properly limited, so don't care too much. Moreover the most likely
398 * case (copy_range is unsupported for the configuration, so the very first
399 * copy_range request fails) is handled by setting large copy_size only
400 * after first successful copy_range.
0e240245
VSO
401 */
402
e332a726 403 bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
beb5f545 404
8719091f 405 ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
beb5f545 406 if (ret < 0) {
8719091f 407 trace_block_copy_read_fail(s, offset, ret);
d7eca542 408 *error_is_read = true;
e332a726 409 goto out;
beb5f545
VSO
410 }
411
8719091f 412 ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
00e30f05 413 s->write_flags);
beb5f545 414 if (ret < 0) {
8719091f 415 trace_block_copy_write_fail(s, offset, ret);
d7eca542 416 *error_is_read = false;
e332a726 417 goto out;
beb5f545
VSO
418 }
419
e332a726 420out:
3816edd2
VSO
421 qemu_vfree(bounce_buffer);
422
beb5f545 423 return ret;
beb5f545
VSO
424}
425
bed95234
VSO
426static void block_copy_handle_copy_range_result(BlockCopyState *s,
427 bool is_success)
428{
429 if (!s->use_copy_range) {
430 /* already disabled */
431 return;
432 }
433
434 if (is_success) {
435 /*
436 * Successful copy-range. Now increase copy_size. copy_range
437 * does not respect max_transfer (it's a TODO), so we factor
438 * that in here.
439 */
440 s->copy_size =
441 MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
442 QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
443 s->target),
444 s->cluster_size));
445 } else {
446 /* Copy-range failed, disable it. */
447 s->use_copy_range = false;
448 s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
449 }
450}
451
4ce5dd3e
VSO
452static coroutine_fn int block_copy_task_entry(AioTask *task)
453{
454 BlockCopyTask *t = container_of(task, BlockCopyTask, task);
c6a3e3df 455 BlockCopyState *s = t->s;
c78dd00e 456 bool error_is_read = false;
bed95234 457 bool copy_range = t->copy_range;
4ce5dd3e
VSO
458 int ret;
459
c6a3e3df 460 ret = block_copy_do_copy(s, t->offset, t->bytes, t->zeroes,
bed95234
VSO
461 &copy_range, &error_is_read);
462 if (t->copy_range) {
c6a3e3df 463 block_copy_handle_copy_range_result(s, copy_range);
bed95234 464 }
8146b357
VSO
465 if (ret < 0) {
466 if (!t->call_state->ret) {
467 t->call_state->ret = ret;
468 t->call_state->error_is_read = error_is_read;
469 }
4ce5dd3e 470 } else {
c6a3e3df 471 progress_work_done(s->progress, t->bytes);
4ce5dd3e 472 }
c6a3e3df 473 co_put_to_shres(s->mem, t->bytes);
4ce5dd3e
VSO
474 block_copy_task_end(t, ret);
475
476 return ret;
477}
478
2d57511a
VSO
479static int block_copy_block_status(BlockCopyState *s, int64_t offset,
480 int64_t bytes, int64_t *pnum)
481{
482 int64_t num;
483 BlockDriverState *base;
484 int ret;
485
c6f6d846
HR
486 if (s->skip_unallocated) {
487 base = bdrv_backing_chain_next(s->source->bs);
2d57511a
VSO
488 } else {
489 base = NULL;
490 }
491
492 ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
493 NULL, NULL);
494 if (ret < 0 || num < s->cluster_size) {
495 /*
496 * On error or if failed to obtain large enough chunk just fallback to
497 * copy one cluster.
498 */
499 num = s->cluster_size;
500 ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
501 } else if (offset + num == s->len) {
502 num = QEMU_ALIGN_UP(num, s->cluster_size);
503 } else {
504 num = QEMU_ALIGN_DOWN(num, s->cluster_size);
505 }
506
507 *pnum = num;
508 return ret;
509}
510
beb5f545
VSO
511/*
512 * Check if the cluster starting at offset is allocated or not.
513 * return via pnum the number of contiguous clusters sharing this allocation.
514 */
515static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
516 int64_t *pnum)
517{
00e30f05 518 BlockDriverState *bs = s->source->bs;
beb5f545
VSO
519 int64_t count, total_count = 0;
520 int64_t bytes = s->len - offset;
521 int ret;
522
523 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
524
525 while (true) {
526 ret = bdrv_is_allocated(bs, offset, bytes, &count);
527 if (ret < 0) {
528 return ret;
529 }
530
531 total_count += count;
532
533 if (ret || count == 0) {
534 /*
535 * ret: partial segment(s) are considered allocated.
536 * otherwise: unallocated tail is treated as an entire segment.
537 */
538 *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
539 return ret;
540 }
541
542 /* Unallocated segment(s) with uncertain following segment(s) */
543 if (total_count >= s->cluster_size) {
544 *pnum = total_count / s->cluster_size;
545 return 0;
546 }
547
548 offset += count;
549 bytes -= count;
550 }
551}
552
553/*
554 * Reset bits in copy_bitmap starting at offset if they represent unallocated
555 * data in the image. May reset subsequent contiguous bits.
556 * @return 0 when the cluster at @offset was unallocated,
557 * 1 otherwise, and -ret on error.
558 */
559int64_t block_copy_reset_unallocated(BlockCopyState *s,
560 int64_t offset, int64_t *count)
561{
562 int ret;
563 int64_t clusters, bytes;
564
565 ret = block_copy_is_cluster_allocated(s, offset, &clusters);
566 if (ret < 0) {
567 return ret;
568 }
569
570 bytes = clusters * s->cluster_size;
571
572 if (!ret) {
573 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
d0ebeca1
VSO
574 progress_set_remaining(s->progress,
575 bdrv_get_dirty_count(s->copy_bitmap) +
576 s->in_flight_bytes);
beb5f545
VSO
577 }
578
579 *count = bytes;
580 return ret;
581}
582
5332e5d2
VSO
583/*
584 * block_copy_dirty_clusters
585 *
586 * Copy dirty clusters in @offset/@bytes range.
587 * Returns 1 if dirty clusters found and successfully copied, 0 if no dirty
588 * clusters found and -errno on failure.
589 */
3b8c2329
VSO
590static int coroutine_fn
591block_copy_dirty_clusters(BlockCopyCallState *call_state)
beb5f545 592{
3b8c2329
VSO
593 BlockCopyState *s = call_state->s;
594 int64_t offset = call_state->offset;
595 int64_t bytes = call_state->bytes;
596
beb5f545 597 int ret = 0;
5332e5d2 598 bool found_dirty = false;
42ac2144 599 int64_t end = offset + bytes;
4ce5dd3e 600 AioTaskPool *aio = NULL;
beb5f545
VSO
601
602 /*
603 * block_copy() user is responsible for keeping source and target in same
604 * aio context
605 */
00e30f05
VSO
606 assert(bdrv_get_aio_context(s->source->bs) ==
607 bdrv_get_aio_context(s->target->bs));
beb5f545 608
8719091f 609 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
dafaf135 610 assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
beb5f545 611
a6d23d56 612 while (bytes && aio_task_pool_status(aio) == 0 && !call_state->cancelled) {
4ce5dd3e 613 BlockCopyTask *task;
42ac2144 614 int64_t status_bytes;
beb5f545 615
3b8c2329 616 task = block_copy_task_create(s, call_state, offset, bytes);
42ac2144
VSO
617 if (!task) {
618 /* No more dirty bits in the bitmap */
619 trace_block_copy_skip_range(s, offset, bytes);
620 break;
621 }
622 if (task->offset > offset) {
623 trace_block_copy_skip_range(s, offset, task->offset - offset);
beb5f545
VSO
624 }
625
5332e5d2
VSO
626 found_dirty = true;
627
42ac2144
VSO
628 ret = block_copy_block_status(s, task->offset, task->bytes,
629 &status_bytes);
5332e5d2 630 assert(ret >= 0); /* never fail */
42ac2144
VSO
631 if (status_bytes < task->bytes) {
632 block_copy_task_shrink(task, status_bytes);
633 }
2d57511a 634 if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
1348a657 635 block_copy_task_end(task, 0);
2d57511a
VSO
636 progress_set_remaining(s->progress,
637 bdrv_get_dirty_count(s->copy_bitmap) +
638 s->in_flight_bytes);
42ac2144
VSO
639 trace_block_copy_skip_range(s, task->offset, task->bytes);
640 offset = task_end(task);
641 bytes = end - offset;
fc9aefc8 642 g_free(task);
2d57511a 643 continue;
beb5f545 644 }
bed95234
VSO
645 if (ret & BDRV_BLOCK_ZERO) {
646 task->zeroes = true;
647 task->copy_range = false;
648 }
beb5f545 649
ca657c99
PB
650 if (!call_state->ignore_ratelimit) {
651 uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
652 if (ns > 0) {
653 block_copy_task_end(task, -EAGAIN);
654 g_free(task);
655 qemu_co_sleep_ns_wakeable(&call_state->sleep,
656 QEMU_CLOCK_REALTIME, ns);
657 continue;
7e032df0 658 }
7e032df0
VSO
659 }
660
ca657c99
PB
661 ratelimit_calculate_delay(&s->rate_limit, task->bytes);
662
42ac2144 663 trace_block_copy_process(s, task->offset);
beb5f545 664
42ac2144 665 co_get_from_shres(s->mem, task->bytes);
beb5f545 666
42ac2144
VSO
667 offset = task_end(task);
668 bytes = end - offset;
4ce5dd3e
VSO
669
670 if (!aio && bytes) {
26be9d62 671 aio = aio_task_pool_new(call_state->max_workers);
4ce5dd3e
VSO
672 }
673
674 ret = block_copy_task_run(aio, task);
675 if (ret < 0) {
676 goto out;
677 }
678 }
679
680out:
681 if (aio) {
682 aio_task_pool_wait_all(aio);
683
684 /*
685 * We are not really interested in -ECANCELED returned from
686 * block_copy_task_run. If it fails, it means some task already failed
687 * for real reason, let's return first failure.
688 * Still, assert that we don't rewrite failure by success.
e8de7ba9
VSO
689 *
690 * Note: ret may be positive here because of block-status result.
4ce5dd3e 691 */
e8de7ba9 692 assert(ret >= 0 || aio_task_pool_status(aio) < 0);
4ce5dd3e
VSO
693 ret = aio_task_pool_status(aio);
694
695 aio_task_pool_free(aio);
696 }
beb5f545 697
4ce5dd3e 698 return ret < 0 ? ret : found_dirty;
5332e5d2
VSO
699}
700
7e032df0
VSO
701void block_copy_kick(BlockCopyCallState *call_state)
702{
29a6ea24 703 qemu_co_sleep_wake(&call_state->sleep);
7e032df0
VSO
704}
705
5332e5d2 706/*
3b8c2329 707 * block_copy_common
5332e5d2
VSO
708 *
709 * Copy requested region, accordingly to dirty bitmap.
710 * Collaborate with parallel block_copy requests: if they succeed it will help
711 * us. If they fail, we will retry not-copied regions. So, if we return error,
712 * it means that some I/O operation failed in context of _this_ block_copy call,
713 * not some parallel operation.
714 */
3b8c2329 715static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
5332e5d2
VSO
716{
717 int ret;
c6a3e3df 718 BlockCopyState *s = call_state->s;
5332e5d2 719
c6a3e3df 720 QLIST_INSERT_HEAD(&s->calls, call_state, list);
2e099a9d 721
5332e5d2 722 do {
3b8c2329 723 ret = block_copy_dirty_clusters(call_state);
5332e5d2 724
a6d23d56 725 if (ret == 0 && !call_state->cancelled) {
c6a3e3df 726 ret = block_copy_wait_one(s, call_state->offset,
3b8c2329 727 call_state->bytes);
5332e5d2
VSO
728 }
729
730 /*
731 * We retry in two cases:
732 * 1. Some progress done
733 * Something was copied, which means that there were yield points
734 * and some new dirty bits may have appeared (due to failed parallel
735 * block-copy requests).
736 * 2. We have waited for some intersecting block-copy request
737 * It may have failed and produced new dirty bits.
738 */
a6d23d56 739 } while (ret > 0 && !call_state->cancelled);
a6ffe199 740
de4641b4
VSO
741 call_state->finished = true;
742
743 if (call_state->cb) {
744 call_state->cb(call_state->cb_opaque);
745 }
746
2e099a9d
VSO
747 QLIST_REMOVE(call_state, list);
748
beb5f545
VSO
749 return ret;
750}
397f4e9d 751
3b8c2329 752int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
143a6384 753 bool ignore_ratelimit)
3b8c2329
VSO
754{
755 BlockCopyCallState call_state = {
756 .s = s,
757 .offset = start,
758 .bytes = bytes,
7e032df0 759 .ignore_ratelimit = ignore_ratelimit,
26be9d62 760 .max_workers = BLOCK_COPY_MAX_WORKERS,
3b8c2329
VSO
761 };
762
143a6384 763 return block_copy_common(&call_state);
3b8c2329
VSO
764}
765
de4641b4
VSO
766static void coroutine_fn block_copy_async_co_entry(void *opaque)
767{
768 block_copy_common(opaque);
769}
770
771BlockCopyCallState *block_copy_async(BlockCopyState *s,
772 int64_t offset, int64_t bytes,
26be9d62 773 int max_workers, int64_t max_chunk,
de4641b4
VSO
774 BlockCopyAsyncCallbackFunc cb,
775 void *cb_opaque)
776{
777 BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);
778
779 *call_state = (BlockCopyCallState) {
780 .s = s,
781 .offset = offset,
782 .bytes = bytes,
26be9d62
VSO
783 .max_workers = max_workers,
784 .max_chunk = max_chunk,
de4641b4
VSO
785 .cb = cb,
786 .cb_opaque = cb_opaque,
787
788 .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
789 };
790
791 qemu_coroutine_enter(call_state->co);
792
793 return call_state;
794}
795
796void block_copy_call_free(BlockCopyCallState *call_state)
797{
798 if (!call_state) {
799 return;
800 }
801
802 assert(call_state->finished);
803 g_free(call_state);
804}
805
806bool block_copy_call_finished(BlockCopyCallState *call_state)
807{
808 return call_state->finished;
809}
810
811bool block_copy_call_succeeded(BlockCopyCallState *call_state)
812{
a6d23d56
VSO
813 return call_state->finished && !call_state->cancelled &&
814 call_state->ret == 0;
de4641b4
VSO
815}
816
817bool block_copy_call_failed(BlockCopyCallState *call_state)
818{
a6d23d56
VSO
819 return call_state->finished && !call_state->cancelled &&
820 call_state->ret < 0;
821}
822
823bool block_copy_call_cancelled(BlockCopyCallState *call_state)
824{
825 return call_state->cancelled;
de4641b4
VSO
826}
827
828int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
829{
830 assert(call_state->finished);
831 if (error_is_read) {
832 *error_is_read = call_state->error_is_read;
833 }
834 return call_state->ret;
835}
836
a6d23d56
VSO
837void block_copy_call_cancel(BlockCopyCallState *call_state)
838{
839 call_state->cancelled = true;
840 block_copy_kick(call_state);
841}
842
397f4e9d
VSO
843BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
844{
845 return s->copy_bitmap;
846}
847
848void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
849{
850 s->skip_unallocated = skip;
851}
7e032df0
VSO
852
853void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
854{
ca657c99 855 ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);
7e032df0
VSO
856
857 /*
858 * Note: it's good to kick all call states from here, but it should be done
859 * only from a coroutine, to not crash if s->calls list changed while
860 * entering one call. So for now, the only user of this function kicks its
861 * only one call_state by hand.
862 */
863}