/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

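/*
 * Return an existing in-flight request that overlaps the byte range
 * [start, end), or NULL if there is none.
 */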
static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t start,
                                                           int64_t end)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (end > req->start_byte && start < req->end_byte) {
            return req;
        }
    }

    return NULL;
}

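/*
 * Wait (in coroutine context) until no in-flight request overlaps
 * [start, end) any more.
 */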
static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end)
{
    BlockCopyInFlightReq *req;

    while ((req = find_conflicting_inflight_req(s, start, end))) {
        qemu_co_queue_wait(&req->wait_queue, NULL);
    }
}

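/* Register @req covering [start, end) so that overlapping copies can wait. */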
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

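/* Drop @req from the list and wake everybody waiting on it. */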
static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

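/*
 * Largest request size that both source and target can handle at once,
 * capped at INT_MAX (MIN_NON_ZERO treats a zero max_transfer as "no limit").
 */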
static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

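/*
 * Allocate a BlockCopyState for copying from @source to @target in
 * @cluster_size chunks, with a dirty bitmap tracking what still has to be
 * copied.  Returns NULL (and sets @errp) if the bitmap cannot be created.
 */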
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer.  We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write honor max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

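/*
 * Progress reporting hooks: a per-chunk byte callback and a ProgressMeter
 * that block_copy() updates as it copies.
 */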
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk.  @end is allowed to exceed s->len only to
 * cover the last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither the bitmap nor intersecting requests are
 * handled, only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t start, int64_t end,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int nbytes = MIN(end, s->len) - start;
    void *bounce_buffer = NULL;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, start, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, start, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range, so now increase copy_size.
                 * copy_range does not respect max_transfer (it's a TODO), so
                 * we factor that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the
                 * bdrv_co_copy_range call above.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * If the copy_range request above failed, we may proceed with a buffered
     * request larger than BLOCK_COPY_MAX_BUFFER.  Further requests will be
     * limited properly, so this is not a real concern.  Moreover, the most
     * likely case (copy_range is unsupported for the configuration, so the
     * very first copy_range request fails) is handled by switching to the
     * large copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

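/*
 * Wrapper around bdrv_block_status_above(): clamp the returned *pnum to a
 * cluster-aligned, non-zero number of bytes, and on error (or a chunk smaller
 * than a cluster) pretend one cluster of allocated data was found so the
 * caller simply copies it.  When skip_unallocated is set and a backing file
 * exists, status is queried only above the backing file, so clusters
 * unallocated in the top image can be skipped.
 */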
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return, via @pnum, the number of contiguous clusters sharing this
 * allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: the unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at @offset if they represent unallocated
 * data in the image.  May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative error code on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}

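/*
 * Copy the dirty clusters in [start, start + bytes) from source to target,
 * serializing against overlapping in-flight requests.  Returns 0 on success
 * or a negative error code; on a read/write failure, *error_is_read (if
 * non-NULL) tells which side failed.
 */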
int coroutine_fn block_copy(BlockCopyState *s,
                            int64_t start, uint64_t bytes,
                            bool *error_is_read)
{
    int ret = 0;
    int64_t end = bytes + start; /* bytes */
    BlockCopyInFlightReq req;

    /*
     * The block_copy() caller is responsible for keeping the source and
     * target in the same AioContext.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    block_copy_wait_inflight_reqs(s, start, end);
    block_copy_inflight_req_begin(s, &req, start, end);

    while (start < end) {
        int64_t next_zero, chunk_end, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */
        }

        chunk_end = MIN(end, start + s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                chunk_end - start);
        if (next_zero >= 0) {
            assert(next_zero > start); /* start is dirty */
            assert(next_zero < chunk_end); /* no need to do MIN() */
            chunk_end = next_zero;
        }

        ret = block_copy_block_status(s, start, chunk_end - start,
                                      &status_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            bdrv_reset_dirty_bitmap(s->copy_bitmap, start, status_bytes);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, start, status_bytes);
            start += status_bytes;
            continue;
        }

        chunk_end = MIN(chunk_end, start + status_bytes);

        trace_block_copy_process(s, start);

        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
        s->in_flight_bytes += chunk_end - start;

        co_get_from_shres(s->mem, chunk_end - start);
        ret = block_copy_do_copy(s, start, chunk_end, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, chunk_end - start);
        s->in_flight_bytes -= chunk_end - start;
        if (ret < 0) {
            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
            break;
        }

        progress_work_done(s->progress, chunk_end - start);
        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
        start = chunk_end;
        ret = 0;
    }

    block_copy_inflight_req_end(&req);

    return ret;
}