/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyTask {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyTask) list;
    CoQueue wait_queue; /* coroutines blocked on this task */
} BlockCopyTask;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyTask) tasks;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;
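
/*
 * Search the list of in-flight tasks for one that intersects the
 * @offset/@bytes range. Returns the first intersecting task found, or NULL
 * if the range does not overlap any in-flight task.
 */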
static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    BlockCopyTask *t;

    QLIST_FOREACH(t, &s->tasks, list) {
        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
            return t;
        }
    }

    return NULL;
}

/*
 * If there are no intersecting tasks, return false. Otherwise, wait for the
 * first intersecting task found to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);

    if (!task) {
        return false;
    }

    qemu_co_queue_wait(&task->wait_queue, NULL);

    return true;
}

/* Called only on a full-dirty region */
static void block_copy_task_begin(BlockCopyState *s, BlockCopyTask *task,
                                  int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_task(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task->offset = offset;
    task->bytes = bytes;
    qemu_co_queue_init(&task->wait_queue);
    QLIST_INSERT_HEAD(&s->tasks, task, list);
}

/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task, to be handled later. Set the dropped bytes dirty
 * again and wake up all coroutines waiting on this task (some of them may no
 * longer intersect with the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyState *s,
                                                BlockCopyTask *task,
                                                int64_t new_bytes)
{
    if (new_bytes == task->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->bytes);

    s->in_flight_bytes -= task->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          task->offset + new_bytes, task->bytes - new_bytes);

    task->bytes = new_bytes;
    qemu_co_queue_restart_all(&task->wait_queue);
}
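
/*
 * Finish a task: drop its in-flight byte accounting, re-mark the region dirty
 * if the copy failed, remove the task from the list and wake all waiters.
 */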
static void coroutine_fn block_copy_task_end(BlockCopyState *s,
                                             BlockCopyTask *task, int ret)
{
    s->in_flight_bytes -= task->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, task->offset, task->bytes);
    }
    QLIST_REMOVE(task, list);
    qemu_co_queue_restart_all(&task->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}
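
/*
 * Return the smaller of the source and target max_transfer limits, treating
 * zero (unlimited) values as absent and capping the result at INT_MAX.
 */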
static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}
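
/*
 * Create a BlockCopyState for copying between @source and @target in
 * @cluster_size chunks. A dirty bitmap of that granularity is created on the
 * source node; copy_size and use_copy_range are chosen from the nodes'
 * max_transfer limits and @write_flags. Returns NULL on error (@errp set).
 */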
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep copy_size small until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->tasks);

    return s;
}
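
/*
 * Install a callback that is invoked with the number of bytes copied each
 * time a chunk completes successfully.
 */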
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy one cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to the
 * cluster size.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case the copy_range request above failed, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so this is not a big concern.
     * Moreover, the most likely case (copy_range is unsupported for the
     * configuration, so the very first copy_range request fails) is handled
     * by setting a large copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}
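
/*
 * Query the allocation/zero status of @offset/@bytes on the source (above the
 * backing file when skip_unallocated is set), storing the cluster-aligned
 * length of the returned extent in *pnum. On error, or if the extent is
 * smaller than one cluster, fall back to reporting a single allocated data
 * cluster.
 */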
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at @offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyTask task;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_task_begin(s, &task, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fails */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_task_shrink(s, &task, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(s, &task, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_task_end(s, &task, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us. If they fail, we retry the not-yet-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made.
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request.
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}