/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

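/*
 * Find an in-flight request that overlaps the byte range [start, end),
 * or return NULL if there is none.
 */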
static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t start,
                                                           int64_t end)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (end > req->start_byte && start < req->end_byte) {
            return req;
        }
    }

    return NULL;
}

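/*
 * Block until no in-flight request conflicts with [start, end).  Must be
 * called from a coroutine; we wait on one conflicting request at a time and
 * re-check, since new conflicts may appear while we sleep.
 */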
static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end)
{
    BlockCopyInFlightReq *req;

    while ((req = find_conflicting_inflight_req(s, start, end))) {
        qemu_co_queue_wait(&req->wait_queue, NULL);
    }
}

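/*
 * Register a new in-flight request covering [start, end).  Callers wait for
 * conflicting requests first (see block_copy()).
 */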
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

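/*
 * Drop the request from the in-flight list and wake every coroutine waiting
 * on it.
 */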
static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

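/* Free a BlockCopyState created by block_copy_state_new(); NULL is a no-op. */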
void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

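/*
 * Largest chunk the block layer lets us move in a single request: the smaller
 * of the source and target max_transfer limits (MIN_NON_ZERO treats 0 as
 * "unlimited"), capped at INT_MAX.
 */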
static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

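/*
 * Create a BlockCopyState for copying from @source to @target.  A dirty
 * bitmap with @cluster_size granularity is created on the source to track
 * the clusters that still need copying, and the initial copy strategy
 * (copy_range vs. bounce buffer) is chosen from @write_flags and the nodes'
 * transfer limits.  Returns NULL and sets @errp on failure.
 */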
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own behalf).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

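/*
 * Register a progress callback: block_copy() invokes it with the number of
 * newly copied bytes after each successfully copied chunk.
 */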
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

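/*
 * Set the ProgressMeter that block_copy() and block_copy_reset_unallocated()
 * update as work is done and the remaining-work estimate changes.
 */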
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy one cluster-aligned chunk. @end is allowed to exceed s->len only to
 * cover the last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither dirty bitmap updates nor intersecting
 * request handling, only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t start, int64_t end,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int nbytes = MIN(end, s->len) - start;
    void *bounce_buffer = NULL;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, start, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, start, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unsets it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so don't care too much. Moreover,
     * the most likely case (copy_range is unsupported for the configuration,
     * so the very first copy_range request fails) is handled by setting the
     * large copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

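/*
 * Wrapper around bdrv_block_status_above() that clamps and aligns the
 * returned byte count to the block-copy cluster size.  On error, or if the
 * result is smaller than one cluster, report a single allocated data cluster
 * instead so the caller can still make progress.  With skip_unallocated set,
 * the source's backing file is used as the status base, so allocation is
 * reported for the top image only.
 */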
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just
         * fall back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative error code on failure.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}

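/*
 * Copy the dirty clusters in [start, start + bytes) from source to target.
 * The range must be cluster-aligned.  Overlapping block_copy() calls are
 * serialized via the in-flight request list; clean (already copied) clusters
 * are skipped, and so are unallocated clusters when skip_unallocated is set.
 * Returns 0 on success or a negative error code on failure; *error_is_read,
 * if provided, reports whether the failure happened on the read side.
 */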
int coroutine_fn block_copy(BlockCopyState *s,
                            int64_t start, uint64_t bytes,
                            bool *error_is_read)
{
    int ret = 0;
    int64_t end = bytes + start; /* bytes */
    BlockCopyInFlightReq req;

    /*
     * The block_copy() caller is responsible for keeping source and target
     * in the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    block_copy_wait_inflight_reqs(s, start, end);
    block_copy_inflight_req_begin(s, &req, start, end);

    while (start < end) {
        int64_t next_zero, chunk_end, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */
        }

        chunk_end = MIN(end, start + s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                chunk_end - start);
        if (next_zero >= 0) {
            assert(next_zero > start); /* start is dirty */
            assert(next_zero < chunk_end); /* no need to do MIN() */
            chunk_end = next_zero;
        }

        ret = block_copy_block_status(s, start, chunk_end - start,
                                      &status_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            bdrv_reset_dirty_bitmap(s->copy_bitmap, start, status_bytes);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, start, status_bytes);
            start += status_bytes;
            continue;
        }

        chunk_end = MIN(chunk_end, start + status_bytes);

        trace_block_copy_process(s, start);

        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
        s->in_flight_bytes += chunk_end - start;

        co_get_from_shres(s->mem, chunk_end - start);
        ret = block_copy_do_copy(s, start, chunk_end, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, chunk_end - start);
        s->in_flight_bytes -= chunk_end - start;
        if (ret < 0) {
            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
            break;
        }

        progress_work_done(s->progress, chunk_end - start);
        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
        start = chunk_end;
        ret = 0;
    }

    block_copy_inflight_req_end(&req);

    return ret;
}