/*
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 */

#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME 100000000ULL /* ns */
typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *target;
    MirrorSyncMode mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t sector_num;
    int64_t granularity;
    size_t buf_size;
    unsigned long *cow_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;

    /* Updated by the AIO completion callbacks, checked by the job coroutine. */
    int in_flight;
    int ret;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    struct iovec iov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;
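/* Route an I/O error to the correct on-error policy: read errors come from
 * the source (s->common.bs), write errors from the target.  Any error drops
 * us out of the synced state until the source is clean again.  */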
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->common.bs,
                                      s->on_source_error, true, error);
    } else {
        return block_job_error_action(&s->common, s->target,
                                      s->on_target_error, false, error);
    }
}
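/* Completion handler shared by reads and writes.  On success, mark the
 * copied chunks in the COW bitmap so that later iterations need not round
 * out to a full cluster again, then wake up the job coroutine.  */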
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    int64_t chunk_num;
    int nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = op->nb_sectors / sectors_per_chunk;
    if (s->cow_bitmap && ret >= 0) {
        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
    }

    g_slice_free(MirrorOp, op);
    qemu_coroutine_enter(s->common.co, NULL);
}
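/* Write-side AIO callback.  On failure the sectors are marked dirty again
 * in the source, so a later iteration will retry them.  */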
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockDriverState *source = s->common.bs;
        BlockErrorAction action;

        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}
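/* Read-side AIO callback.  On success, chain into the write to the target;
 * on failure, re-dirty the sectors and finish the operation immediately.  */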
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockDriverState *source = s->common.bs;
        BlockErrorAction action;

        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}
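/* Submit a read for the next dirty chunk.  If the target cannot do COW from
 * a backing file, the request is rounded out to cluster boundaries the
 * first time that cluster is touched (see cow_bitmap above).  */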
static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk;
    int64_t end, sector_num, chunk_num;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        /* The iterator ran off the end of the dirty bitmap; wrap around and
         * restart from the first dirty chunk.  */
        bdrv_dirty_iter_init(source, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
        assert(s->sector_num >= 0);
    }

    /* If we have no backing file yet in the destination, and the cluster size
     * is very large, we need to do COW ourselves. The first time a cluster is
     * copied, copy it entirely.
     *
     * Because both the granularity and the cluster size are powers of two, the
     * number of sectors to copy cannot exceed one cluster.
     */
    sector_num = s->sector_num;
    sectors_per_chunk = nb_sectors = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = sector_num / sectors_per_chunk;
    if (s->cow_bitmap && !test_bit(chunk_num, s->cow_bitmap)) {
        trace_mirror_cow(s, sector_num);
        bdrv_round_to_clusters(s->target,
                               sector_num, sectors_per_chunk,
                               &sector_num, &nb_sectors);
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
    nb_sectors = MIN(nb_sectors, end - sector_num);

    /* Allocate a MirrorOp that is used as an AIO callback. */
    op = g_slice_new(MirrorOp);
    op->s = s;
    op->iov.iov_base = s->buf;
    op->iov.iov_len  = nb_sectors * 512;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;
    qemu_iovec_init_external(&op->qiov, &op->iov, 1);

    bdrv_reset_dirty(source, sector_num, nb_sectors);

    /* Copy the dirty cluster. */
    s->in_flight++;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
}
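/* Wait until all in-flight operations have completed.  The completion
 * callbacks re-enter the job coroutine, so yielding here is enough.  */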
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        qemu_coroutine_yield();
    }
}
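/* The body of the mirror job coroutine: populate the dirty bitmap (for the
 * full and top sync modes), then loop copying dirty chunks until source and
 * target converge, throttling and publishing progress along the way.  */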
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    BlockDriverState *bs = s->common.bs;
    int64_t sector_num, end, sectors_per_chunk, length;
    uint64_t last_pause_ns;
    BlockDriverInfo bdi;
    char backing_filename[1024];
    int ret = 0;
    int n;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->common.len = bdrv_getlength(bs);
    if (s->common.len < 0) {
        block_job_completed(&s->common, s->common.len);
        return;
    }

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW. Instead, we copy sectors around the
     * dirty data if needed. We need a bitmap to do that.
     */
    bdrv_get_backing_filename(s->target, backing_filename,
                              sizeof(backing_filename));
    if (backing_filename[0] && !s->target->backing_hd) {
        bdrv_get_info(s->target, &bdi);
        if (s->granularity < bdi.cluster_size) {
            s->buf_size = bdi.cluster_size;
            length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
            s->cow_bitmap = bitmap_new(length);
        }
    }

    end = s->common.len >> BDRV_SECTOR_BITS;
    s->buf = qemu_blockalign(bs, s->buf_size);
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;

    if (s->mode != MIRROR_SYNC_MODE_NONE) {
        /* First part, loop on the sectors and initialize the dirty bitmap. */
        BlockDriverState *base;
        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
        for (sector_num = 0; sector_num < end; ) {
            int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
            ret = bdrv_co_is_allocated_above(bs, base,
                                             sector_num, next - sector_num, &n);
            if (ret < 0) {
                goto immediate_exit;
            }

            assert(n > 0);
            if (ret == 1) {
                bdrv_set_dirty(bs, sector_num, n);
                sector_num = next;
            } else {
                sector_num += n;
            }
        }
    }

    bdrv_dirty_iter_init(bs, &s->hbi);
    last_pause_ns = qemu_get_clock_ns(rt_clock);
    for (;;) {
        uint64_t delay_ns;
        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        cnt = bdrv_get_dirty_count(bs);

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that qemu_aio_flush() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight > 0) {
                trace_mirror_yield(s, s->in_flight, cnt);
                qemu_coroutine_yield();
                continue;
            } else if (cnt != 0) {
                mirror_iteration(s);
                continue;
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase. From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion. This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                s->common.offset = end * BDRV_SECTOR_SIZE;
                if (!s->synced) {
                    block_job_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(bs);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            mirror_drain(s);
            cnt = bdrv_get_dirty_count(bs);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced);
        if (!s->synced) {
            /* Publish progress */
            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;

            if (s->common.speed) {
                delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
            } else {
                delay_ns = 0;
            }

            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync. Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        last_pause_ns = qemu_get_clock_ns(rt_clock);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong. Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    bdrv_set_dirty_tracking(bs, 0);
    bdrv_iostatus_disable(s->target);
    if (s->should_complete && ret == 0) {
        if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
            bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
        }
        bdrv_swap(s->target, s->common.bs);
    }
    bdrv_close(s->target);
    bdrv_delete(s->target);
    block_job_completed(&s->common, ret);
}
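/* The speed limit is given in bytes per second; the rate limiter accounts
 * in sectors, so convert before handing it off.  */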
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_set(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
static void mirror_iostatus_reset(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    bdrv_iostatus_reset(s->target);
}
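/* Handler for the block-job-complete command.  The target must be able to
 * open its backing file, and the job must have reached the synced state,
 * before the switch to the target may be requested.  */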
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    int ret;

    ret = bdrv_open_backing_file(s->target);
    if (ret < 0) {
        char backing_filename[PATH_MAX];
        bdrv_get_full_backing_filename(s->target, backing_filename,
                                       sizeof(backing_filename));
        error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
        return;
    }
    if (!s->synced) {
        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
        return;
    }

    s->should_complete = true;
    block_job_resume(job);
}
static BlockJobType mirror_job_type = {
    .instance_size  = sizeof(MirrorBlockJob),
    .job_type       = "mirror",
    .set_speed      = mirror_set_speed,
    .iostatus_reset = mirror_iostatus_reset,
    .complete       = mirror_complete,
};
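/* Entry point for starting a mirror job, e.g. from the drive-mirror QMP
 * command.  A granularity of 0 selects a default based on the target's
 * cluster size; any explicit value must be a power of two.  */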
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, int64_t granularity, MirrorSyncMode mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockDriverCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        /* Choose the default granularity based on the target file's cluster
         * size, clamped between 4k and 64k. */
        BlockDriverInfo bdi;
        if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
            granularity = MAX(4096, bdi.cluster_size);
            granularity = MIN(65536, granularity);
        } else {
            granularity = 65536;
        }
    }

    assert((granularity & (granularity - 1)) == 0);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->target = target;
    s->mode = mode;
    s->granularity = granularity;
    s->buf_size = granularity;

    bdrv_set_dirty_tracking(bs, granularity);
    bdrv_set_enable_write_cache(s->target, true);
    bdrv_set_on_error(s->target, on_target_error, on_target_error);
    bdrv_iostatus_enable(s->target);
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
}