/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

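/*
 * Typical synchronous usage, as a minimal sketch (hypothetical caller code;
 * setting up the source/target BdrvChild objects with suitable permissions
 * is the caller's job and is not shown). block_copy() must be called from
 * coroutine context, and offset/bytes must be aligned to cluster_size:
 *
 *     BlockCopyState *bcs = block_copy_state_new(source, target,
 *                                                cluster_size, true, false,
 *                                                errp);
 *     if (bcs) {
 *         block_copy_set_progress_meter(bcs, pm);
 *         ret = block_copy(bcs, offset, bytes, false);
 *         block_copy_state_free(bcs);
 *     }
 */
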
#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
#include "qemu/coroutine.h"
#include "block/aio_task.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
#define BLOCK_COPY_MAX_WORKERS 64
#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */

typedef enum {
    COPY_READ_WRITE_CLUSTER,
    COPY_READ_WRITE,
    COPY_WRITE_ZEROES,
    COPY_RANGE_SMALL,
    COPY_RANGE_FULL
} BlockCopyMethod;

static coroutine_fn int block_copy_task_entry(AioTask *task);

typedef struct BlockCopyCallState {
    /* Fields initialized in block_copy_async() and never changed. */
    BlockCopyState *s;
    int64_t offset;
    int64_t bytes;
    int max_workers;
    int64_t max_chunk;
    bool ignore_ratelimit;
    BlockCopyAsyncCallbackFunc cb;
    void *cb_opaque;
    /* Coroutine where async block-copy is running */
    Coroutine *co;

    /* Fields whose state changes throughout the execution */
    bool finished; /* atomic */
    QemuCoSleep sleep; /* TODO: protect API with a lock */
    bool cancelled; /* atomic */
    /* To reference all call states from BlockCopyState */
    QLIST_ENTRY(BlockCopyCallState) list;

    /*
     * Fields that report information about return values and errors.
     * Protected by lock in BlockCopyState.
     */
    bool error_is_read;
    /*
     * @ret is set concurrently by tasks under mutex. Only set once by the
     * first failed task (and untouched if no task failed).
     * After finishing (call_state->finished is true), it is not modified
     * anymore and may be safely read without the mutex.
     */
    int ret;
} BlockCopyCallState;

typedef struct BlockCopyTask {
    AioTask task;

    /*
     * Fields initialized in block_copy_task_create()
     * and never changed.
     */
    BlockCopyState *s;
    BlockCopyCallState *call_state;
    int64_t offset;
    /*
     * @method can also be set again in the while loop of
     * block_copy_dirty_clusters(), but it is never accessed concurrently
     * because the only other function that reads it is
     * block_copy_task_entry() and it is invoked afterwards in the same
     * iteration.
     */
    BlockCopyMethod method;

    /*
     * Fields whose state changes throughout the execution.
     * Protected by lock in BlockCopyState.
     */
    CoQueue wait_queue; /* coroutines blocked on this task */
    /*
     * Only protects the case of a parallel read while the @bytes value is
     * being updated in block_copy_task_shrink().
     */
    int64_t bytes;
    QLIST_ENTRY(BlockCopyTask) list;
} BlockCopyTask;

static int64_t task_end(BlockCopyTask *task)
{
    return task->offset + task->bytes;
}

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;

    /*
     * Fields initialized in block_copy_state_new()
     * and never changed.
     */
    int64_t cluster_size;
    int64_t max_transfer;
    uint64_t len;
    BdrvRequestFlags write_flags;

    /*
     * Fields whose state changes throughout the execution.
     * Protected by lock.
     */
    CoMutex lock;
    int64_t in_flight_bytes;
    BlockCopyMethod method;
    QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls */
    QLIST_HEAD(, BlockCopyCallState) calls;
    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated; /* atomic */
    /* State fields that use a thread-safe API */
    BdrvDirtyBitmap *copy_bitmap;
    ProgressMeter *progress;
    SharedResource *mem;
    RateLimit rate_limit;
} BlockCopyState;

/* Called with lock held */
static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    BlockCopyTask *t;

    QLIST_FOREACH(t, &s->tasks, list) {
        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
            return t;
        }
    }

    return NULL;
}

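/*
 * The check above is a half-open interval intersection test: the request
 * [offset, offset + bytes) conflicts with a task [t->offset, t->offset +
 * t->bytes) iff the two intervals overlap. For example (illustrative
 * numbers only), a request covering [128 KiB, 256 KiB) conflicts with a
 * task covering [192 KiB, 320 KiB), but not with one covering
 * [256 KiB, 320 KiB).
 */
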
/*
 * If there are no intersecting tasks return false. Otherwise, wait for the
 * first found intersecting task to finish and return true.
 *
 * Called with lock held. May temporarily release the lock.
 * A return value of false proves that the lock was NOT released.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);

    if (!task) {
        return false;
    }

    qemu_co_queue_wait(&task->wait_queue, &s->lock);

    return true;
}

/* Called with lock held */
static int64_t block_copy_chunk_size(BlockCopyState *s)
{
    switch (s->method) {
    case COPY_READ_WRITE_CLUSTER:
        return s->cluster_size;
    case COPY_READ_WRITE:
    case COPY_RANGE_SMALL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
                   s->max_transfer);
    case COPY_RANGE_FULL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                   s->max_transfer);
    default:
        /* Cannot have COPY_WRITE_ZEROES here. */
        abort();
    }
}

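/*
 * Worked example (illustrative numbers, not taken from a real device): with
 * cluster_size = 64 KiB and max_transfer = 2 MiB, COPY_READ_WRITE and
 * COPY_RANGE_SMALL yield MIN(MAX(64 KiB, 1 MiB), 2 MiB) = 1 MiB per chunk,
 * while COPY_RANGE_FULL yields MIN(MAX(64 KiB, 16 MiB), 2 MiB) = 2 MiB,
 * i.e. the chunk grows once copy_range has proven to work.
 */
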
/*
 * Search for the first dirty area in the offset/bytes range and create a task
 * at the beginning of it.
 */
static coroutine_fn BlockCopyTask *
block_copy_task_create(BlockCopyState *s, BlockCopyCallState *call_state,
                       int64_t offset, int64_t bytes)
{
    BlockCopyTask *task;
    int64_t max_chunk;

    QEMU_LOCK_GUARD(&s->lock);
    max_chunk = MIN_NON_ZERO(block_copy_chunk_size(s), call_state->max_chunk);
    if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                           offset, offset + bytes,
                                           max_chunk, &offset, &bytes))
    {
        return NULL;
    }

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

    /* region is dirty, so no existing tasks are possible in it */
    assert(!find_conflicting_task(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task = g_new(BlockCopyTask, 1);
    *task = (BlockCopyTask) {
        .task.func = block_copy_task_entry,
        .s = s,
        .call_state = call_state,
        .offset = offset,
        .bytes = bytes,
        .method = s->method,
    };
    qemu_co_queue_init(&task->wait_queue);
    QLIST_INSERT_HEAD(&s->tasks, task, list);

    return task;
}

/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task, to be handled later. Set the dirty bits back and
 * wake up all tasks waiting for us (some of them may no longer intersect the
 * shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                int64_t new_bytes)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    if (new_bytes == task->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->bytes);

    task->s->in_flight_bytes -= task->bytes - new_bytes;
    bdrv_set_dirty_bitmap(task->s->copy_bitmap,
                          task->offset + new_bytes, task->bytes - new_bytes);

    task->bytes = new_bytes;
    qemu_co_queue_restart_all(&task->wait_queue);
}

static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    task->s->in_flight_bytes -= task->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->offset, task->bytes);
    }
    QLIST_REMOVE(task, list);
    progress_set_remaining(task->s->progress,
                           bdrv_get_dirty_count(task->s->copy_bitmap) +
                           task->s->in_flight_bytes);
    qemu_co_queue_restart_all(&task->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    ratelimit_destroy(&s->rate_limit);
    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size, bool use_copy_range,
                                     bool compress, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;
    bool is_fleecing;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    /*
     * If the source is in the backing chain of the target, assume that the
     * target is going to be used for "image fleecing", i.e. it should
     * represent a kind of snapshot of the source at backup-start point in
     * time, and the target is going to be read by somebody (for example,
     * used as an NBD export) during the backup job.
     *
     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
     * avoid intersection of backup writes and third-party reads from the
     * target; otherwise, when reading from the target, we may occasionally
     * read data already updated by the guest.
     *
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222
     */
    is_fleecing = bdrv_chain_contains(target->bs, source->bs);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = (is_fleecing ? BDRV_REQ_SERIALISING : 0) |
                       (compress ? BDRV_REQ_WRITE_COMPRESSED : 0),
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
        .max_transfer = QEMU_ALIGN_DOWN(
                                    block_copy_max_transfer(source, target),
                                    cluster_size),
    };

    if (s->max_transfer < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their behalf).
         */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else if (compress) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else {
        /*
         * If copy range is enabled, start with COPY_RANGE_SMALL, until the
         * first successful copy_range (see block_copy_do_copy).
         */
        s->method = use_copy_range ? COPY_RANGE_SMALL : COPY_READ_WRITE;
    }

    ratelimit_init(&s->rate_limit);
    qemu_co_mutex_init(&s->lock);
    QLIST_INIT(&s->tasks);
    QLIST_INIT(&s->calls);

    return s;
}

/* Only set before running the job, no need for locking. */
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * Takes ownership of @task
 *
 * If pool is NULL directly run the task, otherwise schedule it into the pool.
 *
 * Returns: task.func return code if pool is NULL
 *          otherwise -ECANCELED if pool status is bad
 *          otherwise 0 (successfully scheduled)
 */
static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
                                            BlockCopyTask *task)
{
    if (!pool) {
        int ret = task->task.func(&task->task);

        g_free(task);
        return ret;
    }

    aio_task_pool_wait_slot(pool);
    if (aio_task_pool_status(pool) < 0) {
        co_put_to_shres(task->s->mem, task->bytes);
        block_copy_task_end(task, -ECANCELED);
        g_free(task);
        return -ECANCELED;
    }

    aio_task_pool_start_task(pool, &task->task);

    return 0;
}

428 | ||
beb5f545 | 429 | /* |
e332a726 VSO |
430 | * block_copy_do_copy |
431 | * | |
dafaf135 VSO |
432 | * Do copy of cluster-aligned chunk. Requested region is allowed to exceed |
433 | * s->len only to cover last cluster when s->len is not aligned to clusters. | |
e332a726 VSO |
434 | * |
435 | * No sync here: nor bitmap neighter intersecting requests handling, only copy. | |
436 | * | |
05d5e12b PB |
437 | * @method is an in-out argument, so that copy_range can be either extended to |
438 | * a full-size buffer or disabled if the copy_range attempt fails. The output | |
439 | * value of @method should be used for subsequent tasks. | |
e332a726 | 440 | * Returns 0 on success. |
beb5f545 | 441 | */ |
e332a726 | 442 | static int coroutine_fn block_copy_do_copy(BlockCopyState *s, |
8719091f | 443 | int64_t offset, int64_t bytes, |
05d5e12b | 444 | BlockCopyMethod *method, |
bed95234 | 445 | bool *error_is_read) |
beb5f545 VSO |
446 | { |
447 | int ret; | |
8719091f | 448 | int64_t nbytes = MIN(offset + bytes, s->len) - offset; |
e332a726 | 449 | void *bounce_buffer = NULL; |
beb5f545 | 450 | |
8719091f VSO |
451 | assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes); |
452 | assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); | |
dafaf135 | 453 | assert(QEMU_IS_ALIGNED(bytes, s->cluster_size)); |
8719091f VSO |
454 | assert(offset < s->len); |
455 | assert(offset + bytes <= s->len || | |
456 | offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size)); | |
dafaf135 | 457 | assert(nbytes < INT_MAX); |
e332a726 | 458 | |
05d5e12b PB |
459 | switch (*method) { |
460 | case COPY_WRITE_ZEROES: | |
8719091f | 461 | ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags & |
2d57511a VSO |
462 | ~BDRV_REQ_WRITE_COMPRESSED); |
463 | if (ret < 0) { | |
8719091f | 464 | trace_block_copy_write_zeroes_fail(s, offset, ret); |
d7eca542 | 465 | *error_is_read = false; |
2d57511a VSO |
466 | } |
467 | return ret; | |
2d57511a | 468 | |
05d5e12b PB |
469 | case COPY_RANGE_SMALL: |
470 | case COPY_RANGE_FULL: | |
8719091f | 471 | ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes, |
e332a726 | 472 | 0, s->write_flags); |
05d5e12b PB |
473 | if (ret >= 0) { |
474 | /* Successful copy-range, increase chunk size. */ | |
475 | *method = COPY_RANGE_FULL; | |
bed95234 | 476 | return 0; |
e332a726 | 477 | } |
e332a726 | 478 | |
05d5e12b PB |
479 | trace_block_copy_copy_range_fail(s, offset, ret); |
480 | *method = COPY_READ_WRITE; | |
481 | /* Fall through to read+write with allocated buffer */ | |
0e240245 | 482 | |
05d5e12b PB |
483 | case COPY_READ_WRITE_CLUSTER: |
484 | case COPY_READ_WRITE: | |
485 | /* | |
486 | * In case of failed copy_range request above, we may proceed with | |
487 | * buffered request larger than BLOCK_COPY_MAX_BUFFER. | |
488 | * Still, further requests will be properly limited, so don't care too | |
489 | * much. Moreover the most likely case (copy_range is unsupported for | |
490 | * the configuration, so the very first copy_range request fails) | |
491 | * is handled by setting large copy_size only after first successful | |
492 | * copy_range. | |
493 | */ | |
beb5f545 | 494 | |
05d5e12b | 495 | bounce_buffer = qemu_blockalign(s->source->bs, nbytes); |
beb5f545 | 496 | |
05d5e12b PB |
497 | ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0); |
498 | if (ret < 0) { | |
499 | trace_block_copy_read_fail(s, offset, ret); | |
500 | *error_is_read = true; | |
501 | goto out; | |
502 | } | |
beb5f545 | 503 | |
05d5e12b PB |
504 | ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer, |
505 | s->write_flags); | |
506 | if (ret < 0) { | |
507 | trace_block_copy_write_fail(s, offset, ret); | |
508 | *error_is_read = false; | |
509 | goto out; | |
510 | } | |
3816edd2 | 511 | |
05d5e12b PB |
512 | out: |
513 | qemu_vfree(bounce_buffer); | |
514 | break; | |
beb5f545 | 515 | |
05d5e12b PB |
516 | default: |
517 | abort(); | |
bed95234 VSO |
518 | } |
519 | ||
05d5e12b | 520 | return ret; |
bed95234 VSO |
521 | } |
522 | ||
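/*
 * Method transitions driven by block_copy_do_copy() (a summary of the code
 * above): COPY_RANGE_SMALL or COPY_RANGE_FULL becomes COPY_RANGE_FULL after
 * the first successful copy_range, and degrades to COPY_READ_WRITE on a
 * failed copy_range attempt; COPY_READ_WRITE_CLUSTER, COPY_READ_WRITE and
 * COPY_WRITE_ZEROES never change here.
 */
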
static coroutine_fn int block_copy_task_entry(AioTask *task)
{
    BlockCopyTask *t = container_of(task, BlockCopyTask, task);
    BlockCopyState *s = t->s;
    bool error_is_read = false;
    BlockCopyMethod method = t->method;
    int ret;

    ret = block_copy_do_copy(s, t->offset, t->bytes, &method, &error_is_read);

    WITH_QEMU_LOCK_GUARD(&s->lock) {
        if (s->method == t->method) {
            s->method = method;
        }

        if (ret < 0) {
            if (!t->call_state->ret) {
                t->call_state->ret = ret;
                t->call_state->error_is_read = error_is_read;
            }
        } else {
            progress_work_done(s->progress, t->bytes);
        }
    }
    co_put_to_shres(s->mem, t->bytes);
    block_copy_task_end(t, ret);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (qatomic_read(&s->skip_unallocated)) {
        base = bdrv_backing_chain_next(s->source->bs);
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just
         * fall back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

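/*
 * Example of the rounding above (illustrative numbers): with
 * cluster_size = 64 KiB and a status chunk of num = 100 KiB in the middle of
 * the image, num is rounded down to 64 KiB so that the unaligned remainder
 * is re-queried later; at end-of-image the same chunk is rounded up instead,
 * so that the final partial cluster is still covered.
 */
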
/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        qemu_co_mutex_lock(&s->lock);
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
        qemu_co_mutex_unlock(&s->lock);
    }

    *count = bytes;
    return ret;
}

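/*
 * A minimal sketch of how a sync=top job might drive the call above
 * (hypothetical caller code, not part of this file; 'len' stands for the
 * source length): scan the whole image once before copying, so that
 * unallocated areas are cleared from the bitmap:
 *
 *     int64_t offset, count;
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     for (offset = 0; offset < len; offset += count) {
 *         if (block_copy_reset_unallocated(s, offset, &count) < 0) {
 *             break;
 *         }
 *     }
 */
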
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn
block_copy_dirty_clusters(BlockCopyCallState *call_state)
{
    BlockCopyState *s = call_state->s;
    int64_t offset = call_state->offset;
    int64_t bytes = call_state->bytes;

    int ret = 0;
    bool found_dirty = false;
    int64_t end = offset + bytes;
    AioTaskPool *aio = NULL;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes && aio_task_pool_status(aio) == 0 &&
           !qatomic_read(&call_state->cancelled)) {
        BlockCopyTask *task;
        int64_t status_bytes;

        task = block_copy_task_create(s, call_state, offset, bytes);
        if (!task) {
            /* No more dirty bits in the bitmap */
            trace_block_copy_skip_range(s, offset, bytes);
            break;
        }
        if (task->offset > offset) {
            trace_block_copy_skip_range(s, offset, task->offset - offset);
        }

        found_dirty = true;

        ret = block_copy_block_status(s, task->offset, task->bytes,
                                      &status_bytes);
        assert(ret >= 0); /* never fail */
        if (status_bytes < task->bytes) {
            block_copy_task_shrink(task, status_bytes);
        }
        if (qatomic_read(&s->skip_unallocated) &&
            !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(task, 0);
            trace_block_copy_skip_range(s, task->offset, task->bytes);
            offset = task_end(task);
            bytes = end - offset;
            g_free(task);
            continue;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            task->method = COPY_WRITE_ZEROES;
        }

        if (!call_state->ignore_ratelimit) {
            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
            if (ns > 0) {
                block_copy_task_end(task, -EAGAIN);
                g_free(task);
                qemu_co_sleep_ns_wakeable(&call_state->sleep,
                                          QEMU_CLOCK_REALTIME, ns);
                continue;
            }
        }

        ratelimit_calculate_delay(&s->rate_limit, task->bytes);

        trace_block_copy_process(s, task->offset);

        co_get_from_shres(s->mem, task->bytes);

        offset = task_end(task);
        bytes = end - offset;

        if (!aio && bytes) {
            aio = aio_task_pool_new(call_state->max_workers);
        }

        ret = block_copy_task_run(aio, task);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (aio) {
        aio_task_pool_wait_all(aio);

        /*
         * We are not really interested in -ECANCELED returned from
         * block_copy_task_run. If it fails, it means some task already failed
         * for a real reason; let's return the first failure.
         * Still, assert that we don't overwrite a failure with success.
         *
         * Note: ret may be positive here because of the block-status result.
         */
        assert(ret >= 0 || aio_task_pool_status(aio) < 0);
        ret = aio_task_pool_status(aio);

        aio_task_pool_free(aio);
    }

    return ret < 0 ? ret : found_dirty;
}

void block_copy_kick(BlockCopyCallState *call_state)
{
    qemu_co_sleep_wake(&call_state->sleep);
}

/*
 * block_copy_common
 *
 * Copy the requested region, according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us. If they fail, we will retry not-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not some parallel operation.
 */
static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
{
    int ret;
    BlockCopyState *s = call_state->s;

    qemu_co_mutex_lock(&s->lock);
    QLIST_INSERT_HEAD(&s->calls, call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    do {
        ret = block_copy_dirty_clusters(call_state);

        if (ret == 0 && !qatomic_read(&call_state->cancelled)) {
            WITH_QEMU_LOCK_GUARD(&s->lock) {
                /*
                 * Check that there is no task we still need to
                 * wait to complete
                 */
                ret = block_copy_wait_one(s, call_state->offset,
                                          call_state->bytes);
                if (ret == 0) {
                    /*
                     * No pending tasks, but check again the bitmap in this
                     * same critical section, since a task might have failed
                     * between this and the critical section in
                     * block_copy_dirty_clusters().
                     *
                     * block_copy_wait_one return value 0 also means that it
                     * didn't release the lock. So, we are still in the same
                     * critical section, not interrupted by any concurrent
                     * access to state.
                     */
                    ret = bdrv_dirty_bitmap_next_dirty(s->copy_bitmap,
                                                       call_state->offset,
                                                       call_state->bytes) >= 0;
                }
            }
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0 && !qatomic_read(&call_state->cancelled));

    qatomic_store_release(&call_state->finished, true);

    if (call_state->cb) {
        call_state->cb(call_state->cb_opaque);
    }

    qemu_co_mutex_lock(&s->lock);
    QLIST_REMOVE(call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
                            bool ignore_ratelimit)
{
    BlockCopyCallState call_state = {
        .s = s,
        .offset = start,
        .bytes = bytes,
        .ignore_ratelimit = ignore_ratelimit,
        .max_workers = BLOCK_COPY_MAX_WORKERS,
    };

    return block_copy_common(&call_state);
}

static void coroutine_fn block_copy_async_co_entry(void *opaque)
{
    block_copy_common(opaque);
}

BlockCopyCallState *block_copy_async(BlockCopyState *s,
                                     int64_t offset, int64_t bytes,
                                     int max_workers, int64_t max_chunk,
                                     BlockCopyAsyncCallbackFunc cb,
                                     void *cb_opaque)
{
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = offset,
        .bytes = bytes,
        .max_workers = max_workers,
        .max_chunk = max_chunk,
        .cb = cb,
        .cb_opaque = cb_opaque,

        .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
    };

    qemu_coroutine_enter(call_state->co);

    return call_state;
}

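/*
 * A minimal sketch of the asynchronous API (hypothetical caller code, with
 * 'job' and 'copy_done_cb' standing in for caller-side names):
 *
 *     job->call_state = block_copy_async(s, 0, bytes,
 *                                        BLOCK_COPY_MAX_WORKERS, 0,
 *                                        copy_done_cb, job);
 *
 * copy_done_cb(opaque) then inspects job->call_state via
 * block_copy_call_succeeded() / block_copy_call_status() and eventually
 * releases it with block_copy_call_free(). The caller may also interrupt the
 * copy with block_copy_call_cancel() and wait until
 * block_copy_call_finished() reports true.
 */
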
void block_copy_call_free(BlockCopyCallState *call_state)
{
    if (!call_state) {
        return;
    }

    assert(qatomic_read(&call_state->finished));
    g_free(call_state);
}

bool block_copy_call_finished(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->finished);
}

bool block_copy_call_succeeded(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret == 0;
}

bool block_copy_call_failed(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret < 0;
}

bool block_copy_call_cancelled(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->cancelled);
}

int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
{
    assert(qatomic_load_acquire(&call_state->finished));
    if (error_is_read) {
        *error_is_read = call_state->error_is_read;
    }
    return call_state->ret;
}

/*
 * Note that cancelling and finishing are racy.
 * A user can cancel a block-copy that is already finished.
 */
void block_copy_call_cancel(BlockCopyCallState *call_state)
{
    qatomic_set(&call_state->cancelled, true);
    block_copy_kick(call_state);
}

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    qatomic_set(&s->skip_unallocated, skip);
}

void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
{
    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

    /*
     * Note: it would be good to kick all call states from here, but that
     * should be done only from a coroutine, to not crash if the s->calls
     * list changes while entering one call. So for now, the only user of
     * this function kicks its only call_state by hand.
     */
}