/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
#include "qemu/coroutine.h"
#include "block/aio_task.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
#define BLOCK_COPY_MAX_WORKERS 64
#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */

static coroutine_fn int block_copy_task_entry(AioTask *task);

typedef struct BlockCopyCallState {
    /* IN parameters. Initialized in block_copy_async() and never changed. */
    BlockCopyState *s;
    int64_t offset;
    int64_t bytes;
    int max_workers;
    int64_t max_chunk;
    bool ignore_ratelimit;
    BlockCopyAsyncCallbackFunc cb;
    void *cb_opaque;

    /* Coroutine where async block-copy is running */
    Coroutine *co;

    /* To reference all call states from BlockCopyState */
    QLIST_ENTRY(BlockCopyCallState) list;

    /* State */
    int ret;
    bool finished;
    QemuCoSleep sleep;
    bool cancelled;

    /* OUT parameters */
    bool error_is_read;
} BlockCopyCallState;

typedef struct BlockCopyTask {
    AioTask task;

    BlockCopyState *s;
    BlockCopyCallState *call_state;
    int64_t offset;
    int64_t bytes;
    bool zeroes;
    bool copy_range;
    QLIST_ENTRY(BlockCopyTask) list;
    CoQueue wait_queue; /* coroutines blocked on this task */
} BlockCopyTask;

static int64_t task_end(BlockCopyTask *task)
{
    return task->offset + task->bytes;
}

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by block-copy user and user is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls */
    QLIST_HEAD(, BlockCopyCallState) calls;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;

    SharedResource *mem;

    RateLimit rate_limit;
} BlockCopyState;

static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    BlockCopyTask *t;

    QLIST_FOREACH(t, &s->tasks, list) {
        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
            return t;
        }
    }

    return NULL;
}
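
/*
 * find_conflicting_task() uses the usual half-open interval intersection
 * test: [offset, offset + bytes) overlaps [t->offset, t->offset + t->bytes)
 * exactly when each range starts before the other one ends. For example,
 * with 64 KiB clusters a 128 KiB request at offset 0 conflicts with a task
 * at offset 64 KiB, but not with one starting at offset 128 KiB.
 */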

/*
 * If there are no intersecting tasks return false. Otherwise, wait for the
 * first found intersecting task to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);

    if (!task) {
        return false;
    }

    qemu_co_queue_wait(&task->wait_queue, NULL);

    return true;
}

/*
 * Search for the first dirty area in offset/bytes range and create task at
 * the beginning of it.
 */
static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
                                             BlockCopyCallState *call_state,
                                             int64_t offset, int64_t bytes)
{
    BlockCopyTask *task;
    int64_t max_chunk = MIN_NON_ZERO(s->copy_size, call_state->max_chunk);

    if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                           offset, offset + bytes,
                                           max_chunk, &offset, &bytes))
    {
        return NULL;
    }

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

    /* region is dirty, so no existing tasks are possible in it */
    assert(!find_conflicting_task(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task = g_new(BlockCopyTask, 1);
    *task = (BlockCopyTask) {
        .task.func = block_copy_task_entry,
        .s = s,
        .call_state = call_state,
        .offset = offset,
        .bytes = bytes,
        .copy_range = s->use_copy_range,
    };
    qemu_co_queue_init(&task->wait_queue);
    QLIST_INSERT_HEAD(&s->tasks, task, list);

    return task;
}

/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task to be handled later. Set dirty bits back and
 * wake up all tasks waiting for us (maybe some of them no longer intersect
 * with the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                int64_t new_bytes)
{
    if (new_bytes == task->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->bytes);

    task->s->in_flight_bytes -= task->bytes - new_bytes;
    bdrv_set_dirty_bitmap(task->s->copy_bitmap,
                          task->offset + new_bytes, task->bytes - new_bytes);

    task->bytes = new_bytes;
    qemu_co_queue_restart_all(&task->wait_queue);
}

static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
{
    task->s->in_flight_bytes -= task->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->offset, task->bytes);
    }
    QLIST_REMOVE(task, list);
    qemu_co_queue_restart_all(&task->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    ratelimit_destroy(&s->rate_limit);
    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size, bool use_copy_range,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than block-copy cluster size, so fallback to
         * buffered copying (read and write respect max_transfer on their
         * behalf).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep small copy_size, until first
         * successful copy_range (look at block_copy_do_copy).
         */
        s->use_copy_range = use_copy_range;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    ratelimit_init(&s->rate_limit);
    QLIST_INIT(&s->tasks);
    QLIST_INIT(&s->calls);

    return s;
}
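
/*
 * Typical creation flow on the caller's side (illustrative sketch only;
 * "pm" and the error handling are the caller's, and the caller must also
 * arrange appropriate permissions on @source and @target, see the comment
 * in BlockCopyState):
 *
 *     s = block_copy_state_new(source, target, cluster_size, true,
 *                              write_flags, errp);
 *     if (!s) {
 *         return -EINVAL;
 *     }
 *     block_copy_set_progress_meter(s, pm);
 *     ...
 *     block_copy_state_free(s);
 */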

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * Takes ownership of @task
 *
 * If pool is NULL directly run the task, otherwise schedule it into the pool.
 *
 * Returns: task.func return code if pool is NULL
 *          otherwise -ECANCELED if pool status is bad
 *          otherwise 0 (successfully scheduled)
 */
static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
                                            BlockCopyTask *task)
{
    if (!pool) {
        int ret = task->task.func(&task->task);

        g_free(task);
        return ret;
    }

    aio_task_pool_wait_slot(pool);
    if (aio_task_pool_status(pool) < 0) {
        co_put_to_shres(task->s->mem, task->bytes);
        block_copy_task_end(task, -ECANCELED);
        g_free(task);
        return -ECANCELED;
    }

    aio_task_pool_start_task(pool, &task->task);

    return 0;
}

/*
 * block_copy_do_copy
 *
 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
 * s->len only to cover last cluster when s->len is not aligned to clusters.
 *
 * No sync here: neither the bitmap nor intersecting requests are handled,
 * only the copy itself.
 *
 * @copy_range is an in-out argument: if *copy_range is false, copy_range is not
 * done. If *copy_range is true, copy_range is attempted. If the copy_range
 * attempt fails, the function falls back to the usual read+write and
 * *copy_range is set to false. *copy_range and zeroes must not be true
 * simultaneously.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *copy_range,
                                           bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);
    assert(!(*copy_range && zeroes));

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            *error_is_read = false;
        }
        return ret;
    }

    if (*copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            *copy_range = false;
            /* Fallback to read+write with allocated buffer */
        } else {
            return 0;
        }
    }

    /*
     * In case of failed copy_range request above, we may proceed with buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
     * be properly limited, so don't care too much. Moreover the most likely
     * case (copy_range is unsupported for the configuration, so the very first
     * copy_range request fails) is handled by setting large copy_size only
     * after first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        *error_is_read = true;
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        *error_is_read = false;
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

static void block_copy_handle_copy_range_result(BlockCopyState *s,
                                                bool is_success)
{
    if (!s->use_copy_range) {
        /* already disabled */
        return;
    }

    if (is_success) {
        /*
         * Successful copy-range. Now increase copy_size. copy_range
         * does not respect max_transfer (it's a TODO), so we factor
         * that in here.
         */
        s->copy_size =
                MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                    QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                            s->target),
                                    s->cluster_size));
    } else {
        /* Copy-range failed, disable it. */
        s->use_copy_range = false;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }
}
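
/*
 * Illustrative numbers: with a 64 KiB cluster size and no max_transfer
 * limit on either node, copy_size starts at BLOCK_COPY_MAX_BUFFER (1 MiB);
 * the first successful copy_range raises it to BLOCK_COPY_MAX_COPY_RANGE
 * (16 MiB), and a failure drops back to buffered copying in 1 MiB chunks.
 */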

static coroutine_fn int block_copy_task_entry(AioTask *task)
{
    BlockCopyTask *t = container_of(task, BlockCopyTask, task);
    BlockCopyState *s = t->s;
    bool error_is_read = false;
    bool copy_range = t->copy_range;
    int ret;

    ret = block_copy_do_copy(s, t->offset, t->bytes, t->zeroes,
                             &copy_range, &error_is_read);
    if (t->copy_range) {
        block_copy_handle_copy_range_result(s, copy_range);
    }
    if (ret < 0) {
        if (!t->call_state->ret) {
            t->call_state->ret = ret;
            t->call_state->error_is_read = error_is_read;
        }
    } else {
        progress_work_done(s->progress, t->bytes);
    }
    co_put_to_shres(s->mem, t->bytes);
    block_copy_task_end(t, ret);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated) {
        base = bdrv_backing_chain_next(s->source->bs);
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error or if failed to obtain large enough chunk just fallback to
         * copy one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
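
/*
 * For example, with 64 KiB clusters a 96 KiB allocated extent in the middle
 * of the image is clipped down to 64 KiB (the tail is left for the next
 * call), while the same extent ending exactly at s->len is rounded up to
 * 128 KiB so that the final short cluster is still covered.
 */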

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
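
/*
 * A sync=top user would typically drive this in a scan loop before starting
 * to copy (illustrative sketch; "len" is the caller's disk size and error
 * handling is omitted):
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     for (offset = 0; offset < len; offset += count) {
 *         ret = block_copy_reset_unallocated(s, offset, &count);
 *         if (ret < 0) {
 *             break;
 *         }
 *     }
 */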

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in @offset/@bytes range.
 * Returns 1 if dirty clusters found and successfully copied, 0 if no dirty
 * clusters found and -errno on failure.
 */
static int coroutine_fn
block_copy_dirty_clusters(BlockCopyCallState *call_state)
{
    BlockCopyState *s = call_state->s;
    int64_t offset = call_state->offset;
    int64_t bytes = call_state->bytes;

    int ret = 0;
    bool found_dirty = false;
    int64_t end = offset + bytes;
    AioTaskPool *aio = NULL;

    /*
     * block_copy() user is responsible for keeping source and target in same
     * aio context
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes && aio_task_pool_status(aio) == 0 && !call_state->cancelled) {
        BlockCopyTask *task;
        int64_t status_bytes;

        task = block_copy_task_create(s, call_state, offset, bytes);
        if (!task) {
            /* No more dirty bits in the bitmap */
            trace_block_copy_skip_range(s, offset, bytes);
            break;
        }
        if (task->offset > offset) {
            trace_block_copy_skip_range(s, offset, task->offset - offset);
        }

        found_dirty = true;

        ret = block_copy_block_status(s, task->offset, task->bytes,
                                      &status_bytes);
        assert(ret >= 0); /* never fail */
        if (status_bytes < task->bytes) {
            block_copy_task_shrink(task, status_bytes);
        }
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(task, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, task->offset, task->bytes);
            offset = task_end(task);
            bytes = end - offset;
            g_free(task);
            continue;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            task->zeroes = true;
            task->copy_range = false;
        }

        if (!call_state->ignore_ratelimit) {
            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
            if (ns > 0) {
                block_copy_task_end(task, -EAGAIN);
                g_free(task);
                qemu_co_sleep_ns_wakeable(&call_state->sleep,
                                          QEMU_CLOCK_REALTIME, ns);
                continue;
            }
        }

        ratelimit_calculate_delay(&s->rate_limit, task->bytes);

        trace_block_copy_process(s, task->offset);

        co_get_from_shres(s->mem, task->bytes);

        offset = task_end(task);
        bytes = end - offset;

        if (!aio && bytes) {
            aio = aio_task_pool_new(call_state->max_workers);
        }

        ret = block_copy_task_run(aio, task);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (aio) {
        aio_task_pool_wait_all(aio);

        /*
         * We are not really interested in -ECANCELED returned from
         * block_copy_task_run. If it fails, it means some task already failed
         * for real reason, let's return first failure.
         * Still, assert that we don't rewrite failure by success.
         *
         * Note: ret may be positive here because of block-status result.
         */
        assert(ret >= 0 || aio_task_pool_status(aio) < 0);
        ret = aio_task_pool_status(aio);

        aio_task_pool_free(aio);
    }

    return ret < 0 ? ret : found_dirty;
}

void block_copy_kick(BlockCopyCallState *call_state)
{
    qemu_co_sleep_wake(&call_state->sleep);
}

/*
 * block_copy_common
 *
 * Copy requested region, according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed it will help
 * us. If they fail, we will retry not-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not some parallel operation.
 */
static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
{
    int ret;
    BlockCopyState *s = call_state->s;

    QLIST_INSERT_HEAD(&s->calls, call_state, list);

    do {
        ret = block_copy_dirty_clusters(call_state);

        if (ret == 0 && !call_state->cancelled) {
            ret = block_copy_wait_one(s, call_state->offset,
                                      call_state->bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0 && !call_state->cancelled);

    call_state->finished = true;

    if (call_state->cb) {
        call_state->cb(call_state->cb_opaque);
    }

    QLIST_REMOVE(call_state, list);

    return ret;
}

int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
                            bool ignore_ratelimit)
{
    BlockCopyCallState call_state = {
        .s = s,
        .offset = start,
        .bytes = bytes,
        .ignore_ratelimit = ignore_ratelimit,
        .max_workers = BLOCK_COPY_MAX_WORKERS,
    };

    return block_copy_common(&call_state);
}
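
/*
 * Sketch of a synchronous caller (block_copy() must run in a coroutine, and
 * @start/@bytes are cluster-aligned):
 *
 *     ret = block_copy(s, start, bytes, false);
 *     if (ret < 0) {
 *         ... an I/O error happened in the context of this very call ...
 *     }
 */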

static void coroutine_fn block_copy_async_co_entry(void *opaque)
{
    block_copy_common(opaque);
}

BlockCopyCallState *block_copy_async(BlockCopyState *s,
                                     int64_t offset, int64_t bytes,
                                     int max_workers, int64_t max_chunk,
                                     BlockCopyAsyncCallbackFunc cb,
                                     void *cb_opaque)
{
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = offset,
        .bytes = bytes,
        .max_workers = max_workers,
        .max_chunk = max_chunk,
        .cb = cb,
        .cb_opaque = cb_opaque,

        .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
    };

    qemu_coroutine_enter(call_state->co);

    return call_state;
}

void block_copy_call_free(BlockCopyCallState *call_state)
{
    if (!call_state) {
        return;
    }

    assert(call_state->finished);
    g_free(call_state);
}

bool block_copy_call_finished(BlockCopyCallState *call_state)
{
    return call_state->finished;
}

bool block_copy_call_succeeded(BlockCopyCallState *call_state)
{
    return call_state->finished && !call_state->cancelled &&
        call_state->ret == 0;
}

bool block_copy_call_failed(BlockCopyCallState *call_state)
{
    return call_state->finished && !call_state->cancelled &&
        call_state->ret < 0;
}

bool block_copy_call_cancelled(BlockCopyCallState *call_state)
{
    return call_state->cancelled;
}

int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
{
    assert(call_state->finished);
    if (error_is_read) {
        *error_is_read = call_state->error_is_read;
    }
    return call_state->ret;
}

void block_copy_call_cancel(BlockCopyCallState *call_state)
{
    call_state->cancelled = true;
    block_copy_kick(call_state);
}
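
/*
 * Sketch of the async API lifecycle (illustrative only; "my_cb" and
 * "opaque" are the caller's). max_chunk == 0 means no extra per-task limit.
 * The callback is invoked from block-copy's coroutine, so a real user would
 * typically only schedule a BH from it:
 *
 *     cs = block_copy_async(s, offset, bytes, BLOCK_COPY_MAX_WORKERS, 0,
 *                           my_cb, opaque);
 *     ...
 *     block_copy_call_cancel(cs);
 *     ...
 *     if (block_copy_call_finished(cs)) {
 *         ret = block_copy_call_status(cs, &error_is_read);
 *         block_copy_call_free(cs);
 *     }
 */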

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}

void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
{
    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

    /*
     * Note: it's good to kick all call states from here, but it should be done
     * only from a coroutine, to not crash if s->calls list changed while
     * entering one call. So for now, the only user of this function kicks its
     * only one call_state by hand.
     */
}