/*
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
/* Upper bound for a single copy_range request */
#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
/* Upper bound for a single buffered (read+write) request */
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
/* Total memory budget shared by all in-flight block-copy requests */
#define BLOCK_COPY_MAX_MEM (128 * MiB)
27 static BlockCopyInFlightReq
*find_conflicting_inflight_req(BlockCopyState
*s
,
31 BlockCopyInFlightReq
*req
;
33 QLIST_FOREACH(req
, &s
->inflight_reqs
, list
) {
34 if (offset
+ bytes
> req
->offset
&& offset
< req
->offset
+ req
->bytes
) {
42 static void coroutine_fn
block_copy_wait_inflight_reqs(BlockCopyState
*s
,
46 BlockCopyInFlightReq
*req
;
48 while ((req
= find_conflicting_inflight_req(s
, offset
, bytes
))) {
49 qemu_co_queue_wait(&req
->wait_queue
, NULL
);
53 static void block_copy_inflight_req_begin(BlockCopyState
*s
,
54 BlockCopyInFlightReq
*req
,
55 int64_t offset
, int64_t bytes
)
59 qemu_co_queue_init(&req
->wait_queue
);
60 QLIST_INSERT_HEAD(&s
->inflight_reqs
, req
, list
);
63 static void coroutine_fn
block_copy_inflight_req_end(BlockCopyInFlightReq
*req
)
65 QLIST_REMOVE(req
, list
);
66 qemu_co_queue_restart_all(&req
->wait_queue
);
69 void block_copy_state_free(BlockCopyState
*s
)
75 bdrv_release_dirty_bitmap(s
->copy_bitmap
);
76 shres_destroy(s
->mem
);
80 static uint32_t block_copy_max_transfer(BdrvChild
*source
, BdrvChild
*target
)
82 return MIN_NON_ZERO(INT_MAX
,
83 MIN_NON_ZERO(source
->bs
->bl
.max_transfer
,
84 target
->bs
->bl
.max_transfer
));
87 BlockCopyState
*block_copy_state_new(BdrvChild
*source
, BdrvChild
*target
,
89 BdrvRequestFlags write_flags
, Error
**errp
)
92 BdrvDirtyBitmap
*copy_bitmap
;
94 copy_bitmap
= bdrv_create_dirty_bitmap(source
->bs
, cluster_size
, NULL
,
99 bdrv_disable_dirty_bitmap(copy_bitmap
);
101 s
= g_new(BlockCopyState
, 1);
102 *s
= (BlockCopyState
) {
105 .copy_bitmap
= copy_bitmap
,
106 .cluster_size
= cluster_size
,
107 .len
= bdrv_dirty_bitmap_size(copy_bitmap
),
108 .write_flags
= write_flags
,
109 .mem
= shres_create(BLOCK_COPY_MAX_MEM
),
112 if (block_copy_max_transfer(source
, target
) < cluster_size
) {
114 * copy_range does not respect max_transfer. We don't want to bother
115 * with requests smaller than block-copy cluster size, so fallback to
116 * buffered copying (read and write respect max_transfer on their
119 s
->use_copy_range
= false;
120 s
->copy_size
= cluster_size
;
121 } else if (write_flags
& BDRV_REQ_WRITE_COMPRESSED
) {
122 /* Compression supports only cluster-size writes and no copy-range. */
123 s
->use_copy_range
= false;
124 s
->copy_size
= cluster_size
;
127 * We enable copy-range, but keep small copy_size, until first
128 * successful copy_range (look at block_copy_do_copy).
130 s
->use_copy_range
= true;
131 s
->copy_size
= MAX(s
->cluster_size
, BLOCK_COPY_MAX_BUFFER
);
134 QLIST_INIT(&s
->inflight_reqs
);
139 void block_copy_set_progress_callback(
141 ProgressBytesCallbackFunc progress_bytes_callback
,
142 void *progress_opaque
)
144 s
->progress_bytes_callback
= progress_bytes_callback
;
145 s
->progress_opaque
= progress_opaque
;
148 void block_copy_set_progress_meter(BlockCopyState
*s
, ProgressMeter
*pm
)
/*
 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
 * s->len only to cover last cluster when s->len is not aligned to clusters.
 *
 * No sync here: neither the copy bitmap nor intersecting-request handling,
 * only the copy itself.
 *
 * Returns 0 on success.
 */
163 static int coroutine_fn
block_copy_do_copy(BlockCopyState
*s
,
164 int64_t offset
, int64_t bytes
,
165 bool zeroes
, bool *error_is_read
)
168 int64_t nbytes
= MIN(offset
+ bytes
, s
->len
) - offset
;
169 void *bounce_buffer
= NULL
;
171 assert(offset
>= 0 && bytes
> 0 && INT64_MAX
- offset
>= bytes
);
172 assert(QEMU_IS_ALIGNED(offset
, s
->cluster_size
));
173 assert(QEMU_IS_ALIGNED(bytes
, s
->cluster_size
));
174 assert(offset
< s
->len
);
175 assert(offset
+ bytes
<= s
->len
||
176 offset
+ bytes
== QEMU_ALIGN_UP(s
->len
, s
->cluster_size
));
177 assert(nbytes
< INT_MAX
);
180 ret
= bdrv_co_pwrite_zeroes(s
->target
, offset
, nbytes
, s
->write_flags
&
181 ~BDRV_REQ_WRITE_COMPRESSED
);
183 trace_block_copy_write_zeroes_fail(s
, offset
, ret
);
185 *error_is_read
= false;
191 if (s
->use_copy_range
) {
192 ret
= bdrv_co_copy_range(s
->source
, offset
, s
->target
, offset
, nbytes
,
195 trace_block_copy_copy_range_fail(s
, offset
, ret
);
196 s
->use_copy_range
= false;
197 s
->copy_size
= MAX(s
->cluster_size
, BLOCK_COPY_MAX_BUFFER
);
198 /* Fallback to read+write with allocated buffer */
200 if (s
->use_copy_range
) {
202 * Successful copy-range. Now increase copy_size. copy_range
203 * does not respect max_transfer (it's a TODO), so we factor
206 * Note: we double-check s->use_copy_range for the case when
207 * parallel block-copy request unsets it during previous
208 * bdrv_co_copy_range call.
211 MIN(MAX(s
->cluster_size
, BLOCK_COPY_MAX_COPY_RANGE
),
212 QEMU_ALIGN_DOWN(block_copy_max_transfer(s
->source
,
221 * In case of failed copy_range request above, we may proceed with buffered
222 * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
223 * be properly limited, so don't care too much. Moreover the most likely
224 * case (copy_range is unsupported for the configuration, so the very first
225 * copy_range request fails) is handled by setting large copy_size only
226 * after first successful copy_range.
229 bounce_buffer
= qemu_blockalign(s
->source
->bs
, nbytes
);
231 ret
= bdrv_co_pread(s
->source
, offset
, nbytes
, bounce_buffer
, 0);
233 trace_block_copy_read_fail(s
, offset
, ret
);
235 *error_is_read
= true;
240 ret
= bdrv_co_pwrite(s
->target
, offset
, nbytes
, bounce_buffer
,
243 trace_block_copy_write_fail(s
, offset
, ret
);
245 *error_is_read
= false;
251 qemu_vfree(bounce_buffer
);
256 static int block_copy_block_status(BlockCopyState
*s
, int64_t offset
,
257 int64_t bytes
, int64_t *pnum
)
260 BlockDriverState
*base
;
263 if (s
->skip_unallocated
&& s
->source
->bs
->backing
) {
264 base
= s
->source
->bs
->backing
->bs
;
269 ret
= bdrv_block_status_above(s
->source
->bs
, base
, offset
, bytes
, &num
,
271 if (ret
< 0 || num
< s
->cluster_size
) {
273 * On error or if failed to obtain large enough chunk just fallback to
276 num
= s
->cluster_size
;
277 ret
= BDRV_BLOCK_ALLOCATED
| BDRV_BLOCK_DATA
;
278 } else if (offset
+ num
== s
->len
) {
279 num
= QEMU_ALIGN_UP(num
, s
->cluster_size
);
281 num
= QEMU_ALIGN_DOWN(num
, s
->cluster_size
);
/*
 * Check if the cluster starting at offset is allocated or not.
 * return via pnum the number of contiguous clusters sharing this allocation.
 */
292 static int block_copy_is_cluster_allocated(BlockCopyState
*s
, int64_t offset
,
295 BlockDriverState
*bs
= s
->source
->bs
;
296 int64_t count
, total_count
= 0;
297 int64_t bytes
= s
->len
- offset
;
300 assert(QEMU_IS_ALIGNED(offset
, s
->cluster_size
));
303 ret
= bdrv_is_allocated(bs
, offset
, bytes
, &count
);
308 total_count
+= count
;
310 if (ret
|| count
== 0) {
312 * ret: partial segment(s) are considered allocated.
313 * otherwise: unallocated tail is treated as an entire segment.
315 *pnum
= DIV_ROUND_UP(total_count
, s
->cluster_size
);
319 /* Unallocated segment(s) with uncertain following segment(s) */
320 if (total_count
>= s
->cluster_size
) {
321 *pnum
= total_count
/ s
->cluster_size
;
/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
336 int64_t block_copy_reset_unallocated(BlockCopyState
*s
,
337 int64_t offset
, int64_t *count
)
340 int64_t clusters
, bytes
;
342 ret
= block_copy_is_cluster_allocated(s
, offset
, &clusters
);
347 bytes
= clusters
* s
->cluster_size
;
350 bdrv_reset_dirty_bitmap(s
->copy_bitmap
, offset
, bytes
);
351 progress_set_remaining(s
->progress
,
352 bdrv_get_dirty_count(s
->copy_bitmap
) +
360 int coroutine_fn
block_copy(BlockCopyState
*s
,
361 int64_t offset
, int64_t bytes
,
365 BlockCopyInFlightReq req
;
368 * block_copy() user is responsible for keeping source and target in same
371 assert(bdrv_get_aio_context(s
->source
->bs
) ==
372 bdrv_get_aio_context(s
->target
->bs
));
374 assert(QEMU_IS_ALIGNED(offset
, s
->cluster_size
));
375 assert(QEMU_IS_ALIGNED(bytes
, s
->cluster_size
));
377 block_copy_wait_inflight_reqs(s
, offset
, bytes
);
378 block_copy_inflight_req_begin(s
, &req
, offset
, bytes
);
381 int64_t next_zero
, cur_bytes
, status_bytes
;
383 if (!bdrv_dirty_bitmap_get(s
->copy_bitmap
, offset
)) {
384 trace_block_copy_skip(s
, offset
);
385 offset
+= s
->cluster_size
;
386 bytes
-= s
->cluster_size
;
387 continue; /* already copied */
390 cur_bytes
= MIN(bytes
, s
->copy_size
);
392 next_zero
= bdrv_dirty_bitmap_next_zero(s
->copy_bitmap
, offset
,
394 if (next_zero
>= 0) {
395 assert(next_zero
> offset
); /* offset is dirty */
396 assert(next_zero
< offset
+ cur_bytes
); /* no need to do MIN() */
397 cur_bytes
= next_zero
- offset
;
400 ret
= block_copy_block_status(s
, offset
, cur_bytes
, &status_bytes
);
401 if (s
->skip_unallocated
&& !(ret
& BDRV_BLOCK_ALLOCATED
)) {
402 bdrv_reset_dirty_bitmap(s
->copy_bitmap
, offset
, status_bytes
);
403 progress_set_remaining(s
->progress
,
404 bdrv_get_dirty_count(s
->copy_bitmap
) +
406 trace_block_copy_skip_range(s
, offset
, status_bytes
);
407 offset
+= status_bytes
;
408 bytes
-= status_bytes
;
412 cur_bytes
= MIN(cur_bytes
, status_bytes
);
414 trace_block_copy_process(s
, offset
);
416 bdrv_reset_dirty_bitmap(s
->copy_bitmap
, offset
, cur_bytes
);
417 s
->in_flight_bytes
+= cur_bytes
;
419 co_get_from_shres(s
->mem
, cur_bytes
);
420 ret
= block_copy_do_copy(s
, offset
, cur_bytes
, ret
& BDRV_BLOCK_ZERO
,
422 co_put_to_shres(s
->mem
, cur_bytes
);
423 s
->in_flight_bytes
-= cur_bytes
;
425 bdrv_set_dirty_bitmap(s
->copy_bitmap
, offset
, cur_bytes
);
429 progress_work_done(s
->progress
, cur_bytes
);
430 s
->progress_bytes_callback(cur_bytes
, s
->progress_opaque
);
435 block_copy_inflight_req_end(&req
);