/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
14 #include "qemu-common.h"
15 #include "block_int.h"
17 #include "block-migration.h"
#define SECTOR_SIZE (1 << SECTOR_BITS)
/* Mask selecting the sector-aligned part of a byte address.
 * FIX: the original definition ended with a stray ';', which would
 * terminate any statement the macro was expanded into mid-expression. */
#define SECTOR_MASK ~(SECTOR_SIZE - 1)

/* Size in bytes of one migration chunk (sectors_per_block sectors). */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

/* On-the-wire flags stored in the low bits of the 64-bit sector address. */
#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS          0x02

/* Upper bound on sectors scanned per bdrv_is_allocated() query. */
#define MAX_IS_ALLOCATED_SEARCH 65536

#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif
45 typedef struct BlkMigBlock
{
51 BlockDriverAIOCB
*aiocb
;
53 struct BlkMigBlock
*next
;
56 typedef struct BlkMigState
{
62 BlkMigDevState
*bmds_first
;
63 int sectors_per_block
;
64 BlkMigBlock
*first_blk
;
65 BlkMigBlock
*last_blk
;
69 int64_t print_completion
;
72 static BlkMigState
*block_mig_state
= NULL
;
74 static void blk_mig_read_cb(void *opaque
, int ret
)
76 BlkMigBlock
*blk
= opaque
;
80 /* insert at the end */
81 if(block_mig_state
->last_blk
== NULL
) {
82 block_mig_state
->first_blk
= blk
;
83 block_mig_state
->last_blk
= blk
;
85 block_mig_state
->last_blk
->next
= blk
;
86 block_mig_state
->last_blk
= blk
;
89 block_mig_state
->submitted
--;
90 block_mig_state
->read_done
++;
91 assert(block_mig_state
->submitted
>= 0);
96 static int mig_read_device_bulk(QEMUFile
*f
, BlkMigDevState
*bms
)
99 int64_t total_sectors
, cur_sector
= 0;
100 BlockDriverState
*bs
= bms
->bs
;
103 blk
= qemu_malloc(sizeof(BlkMigBlock
));
104 blk
->buf
= qemu_malloc(BLOCK_SIZE
);
106 cur_sector
= bms
->cur_sector
;
107 total_sectors
= bdrv_getlength(bs
) >> SECTOR_BITS
;
109 if(bms
->shared_base
) {
110 while(cur_sector
< bms
->total_sectors
&&
111 !bdrv_is_allocated(bms
->bs
, cur_sector
,
112 MAX_IS_ALLOCATED_SEARCH
, &nr_sectors
)) {
113 cur_sector
+= nr_sectors
;
117 if(cur_sector
>= total_sectors
) {
118 bms
->cur_sector
= total_sectors
;
124 if(cur_sector
>= block_mig_state
->print_completion
) {
125 printf("Completed %" PRId64
" %%\r", cur_sector
* 100 / total_sectors
);
127 block_mig_state
->print_completion
+=
128 (block_mig_state
->sectors_per_block
* 10000);
131 /* we going to transfder BLOCK_SIZE any way even if it is not allocated */
132 nr_sectors
= block_mig_state
->sectors_per_block
;
134 cur_sector
&= ~((int64_t)block_mig_state
->sectors_per_block
-1);
136 if(total_sectors
- cur_sector
< block_mig_state
->sectors_per_block
) {
137 nr_sectors
= (total_sectors
- cur_sector
);
140 bms
->cur_sector
= cur_sector
+ nr_sectors
;
141 blk
->sector
= cur_sector
;
145 blk
->iov
.iov_base
= blk
->buf
;
146 blk
->iov
.iov_len
= nr_sectors
* SECTOR_SIZE
;
147 qemu_iovec_init_external(&blk
->qiov
, &blk
->iov
, 1);
149 blk
->aiocb
= bdrv_aio_readv(bs
, cur_sector
, &blk
->qiov
,
150 nr_sectors
, blk_mig_read_cb
, blk
);
153 printf("Error reading sector %" PRId64
"\n", cur_sector
);
159 bdrv_reset_dirty(bms
->bs
, cur_sector
, nr_sectors
);
160 block_mig_state
->submitted
++;
162 return (bms
->cur_sector
>= total_sectors
);
165 static int mig_save_device_bulk(QEMUFile
*f
, BlkMigDevState
*bmds
)
168 int64_t total_sectors
= bmds
->total_sectors
, cur_sector
= 0;
169 uint8_t *tmp_buf
= NULL
;
170 BlockDriverState
*bs
= bmds
->bs
;
172 tmp_buf
= qemu_malloc(BLOCK_SIZE
);
174 cur_sector
= bmds
->cur_sector
;
176 if(bmds
->shared_base
) {
177 while(cur_sector
< bmds
->total_sectors
&&
178 !bdrv_is_allocated(bmds
->bs
, cur_sector
,
179 MAX_IS_ALLOCATED_SEARCH
, &nr_sectors
)) {
180 cur_sector
+= nr_sectors
;
184 if(cur_sector
>= total_sectors
) {
185 bmds
->cur_sector
= total_sectors
;
190 if(cur_sector
>= block_mig_state
->print_completion
) {
191 printf("Completed %" PRId64
" %%\r", cur_sector
* 100 / total_sectors
);
193 block_mig_state
->print_completion
+=
194 (block_mig_state
->sectors_per_block
* 10000);
197 cur_sector
&= ~((int64_t)block_mig_state
->sectors_per_block
-1);
199 /* we going to transfer
201 any way even if it is not allocated */
202 nr_sectors
= block_mig_state
->sectors_per_block
;
204 if(total_sectors
- cur_sector
< block_mig_state
->sectors_per_block
) {
205 nr_sectors
= (total_sectors
- cur_sector
);
208 if(bdrv_read(bs
, cur_sector
, tmp_buf
, nr_sectors
) < 0) {
209 printf("Error reading sector %" PRId64
"\n", cur_sector
);
212 bdrv_reset_dirty(bs
, cur_sector
, nr_sectors
);
215 qemu_put_be64(f
,(cur_sector
<< SECTOR_BITS
) | BLK_MIG_FLAG_DEVICE_BLOCK
);
217 len
= strlen(bs
->device_name
);
218 qemu_put_byte(f
, len
);
219 qemu_put_buffer(f
, (uint8_t *)bs
->device_name
, len
);
221 qemu_put_buffer(f
, tmp_buf
,
224 bmds
->cur_sector
= cur_sector
+ block_mig_state
->sectors_per_block
;
228 return (bmds
->cur_sector
>= total_sectors
);
231 static void send_blk(QEMUFile
*f
, BlkMigBlock
* blk
)
236 qemu_put_be64(f
,(blk
->sector
<< SECTOR_BITS
) | BLK_MIG_FLAG_DEVICE_BLOCK
);
238 len
= strlen(blk
->bmds
->bs
->device_name
);
239 qemu_put_byte(f
, len
);
240 qemu_put_buffer(f
, (uint8_t *)blk
->bmds
->bs
->device_name
, len
);
242 qemu_put_buffer(f
, blk
->buf
,
248 static void blk_mig_save_dev_info(QEMUFile
*f
, BlkMigDevState
*bmds
)
252 static void set_dirty_tracking(int enable
)
254 BlkMigDevState
*bmds
;
255 for(bmds
= block_mig_state
->bmds_first
; bmds
!= NULL
; bmds
= bmds
->next
) {
256 bdrv_set_dirty_tracking(bmds
->bs
,enable
);
262 static void init_blk_migration(QEMUFile
*f
)
264 BlkMigDevState
**pbmds
, *bmds
;
265 BlockDriverState
*bs
;
267 for (bs
= bdrv_first
; bs
!= NULL
; bs
= bs
->next
) {
268 if(bs
->type
== BDRV_TYPE_HD
) {
269 bmds
= qemu_mallocz(sizeof(BlkMigDevState
));
271 bmds
->bulk_completed
= 0;
272 bmds
->total_sectors
= bdrv_getlength(bs
) >> SECTOR_BITS
;
273 bmds
->shared_base
= block_mig_state
->shared_base
;
275 if(bmds
->shared_base
) {
276 printf("Start migration for %s with shared base image\n",
279 printf("Start full migration for %s\n", bs
->device_name
);
282 /* insert at the end */
283 pbmds
= &block_mig_state
->bmds_first
;
284 while (*pbmds
!= NULL
)
285 pbmds
= &(*pbmds
)->next
;
288 blk_mig_save_dev_info(f
, bmds
);
293 block_mig_state
->sectors_per_block
= bdrv_get_sectors_per_chunk();
298 static int blk_mig_save_bulked_block(QEMUFile
*f
, int is_async
)
300 BlkMigDevState
*bmds
;
302 for (bmds
= block_mig_state
->bmds_first
; bmds
!= NULL
; bmds
= bmds
->next
) {
303 if(bmds
->bulk_completed
== 0) {
305 if(mig_read_device_bulk(f
, bmds
) == 1) {
306 /* completed bulk section for this device */
307 bmds
->bulk_completed
= 1;
310 if(mig_save_device_bulk(f
,bmds
) == 1) {
311 /* completed bulk section for this device */
312 bmds
->bulk_completed
= 1;
319 /* we reached here means bulk is completed */
320 block_mig_state
->bulk_completed
= 1;
326 #define MAX_NUM_BLOCKS 4
328 static void blk_mig_save_dirty_blocks(QEMUFile
*f
)
330 BlkMigDevState
*bmds
;
331 uint8_t buf
[BLOCK_SIZE
];
335 for(bmds
= block_mig_state
->bmds_first
; bmds
!= NULL
; bmds
= bmds
->next
) {
336 for(sector
= 0; sector
< bmds
->cur_sector
;) {
338 if(bdrv_get_dirty(bmds
->bs
,sector
)) {
340 if(bdrv_read(bmds
->bs
, sector
, buf
,
341 block_mig_state
->sectors_per_block
) < 0) {
345 qemu_put_be64(f
,(sector
<< SECTOR_BITS
)
346 | BLK_MIG_FLAG_DEVICE_BLOCK
);
348 len
= strlen(bmds
->bs
->device_name
);
350 qemu_put_byte(f
, len
);
351 qemu_put_buffer(f
, (uint8_t *)bmds
->bs
->device_name
, len
);
353 qemu_put_buffer(f
, buf
,
354 (block_mig_state
->sectors_per_block
*
357 bdrv_reset_dirty(bmds
->bs
, sector
,
358 block_mig_state
->sectors_per_block
);
360 sector
+= block_mig_state
->sectors_per_block
;
362 /* sector is clean */
363 sector
+= block_mig_state
->sectors_per_block
;
371 static void flush_blks(QEMUFile
* f
)
373 BlkMigBlock
*blk
, *tmp
;
375 dprintf("%s Enter submitted %d read_done %d transfered\n", __FUNCTION__
,
376 submitted
, read_done
, transfered
);
378 for(blk
= block_mig_state
->first_blk
;
379 blk
!= NULL
&& !qemu_file_rate_limit(f
); blk
= tmp
) {
386 block_mig_state
->read_done
--;
387 block_mig_state
->transferred
++;
388 assert(block_mig_state
->read_done
>= 0);
390 block_mig_state
->first_blk
= blk
;
392 if(block_mig_state
->first_blk
== NULL
) {
393 block_mig_state
->last_blk
= NULL
;
396 dprintf("%s Exit submitted %d read_done %d transferred%d\n", __FUNCTION__
,
397 block_mig_state
->submitted
, block_mig_state
->read_done
,
398 block_mig_state
->transferred
);
403 static int is_stage2_completed(void)
405 BlkMigDevState
*bmds
;
407 if(block_mig_state
->submitted
> 0) {
411 for (bmds
= block_mig_state
->bmds_first
; bmds
!= NULL
; bmds
= bmds
->next
) {
412 if(bmds
->bulk_completed
== 0) {
420 static int block_save_live(QEMUFile
*f
, int stage
, void *opaque
)
424 dprintf("Enter save live stage %d submitted %d transferred %d\n", stage
,
425 submitted
, transferred
);
427 if(block_mig_state
->blk_enable
!= 1) {
428 /* no need to migrate storage */
430 qemu_put_be64(f
,BLK_MIG_FLAG_EOS
);
435 init_blk_migration(f
);
437 /* start track dirty blocks */
438 set_dirty_tracking(1);
444 /* control the rate of transfer */
445 while ((block_mig_state
->submitted
+ block_mig_state
->read_done
) *
447 (qemu_file_get_rate_limit(f
))) {
449 ret
= blk_mig_save_bulked_block(f
, 1);
451 if (ret
== 0) /* no more bulk blocks for now*/
459 while(blk_mig_save_bulked_block(f
, 0) != 0);
461 blk_mig_save_dirty_blocks(f
);
463 /* stop track dirty blocks */
464 set_dirty_tracking(0);;
466 printf("\nBlock migration completed\n");
469 qemu_put_be64(f
,BLK_MIG_FLAG_EOS
);
471 return ((stage
== 2) && is_stage2_completed());
474 static int block_load(QEMUFile
*f
, void *opaque
, int version_id
)
477 char device_name
[256];
479 BlockDriverState
*bs
;
482 block_mig_state
->sectors_per_block
= bdrv_get_sectors_per_chunk();
483 buf
= qemu_malloc(BLOCK_SIZE
);
487 addr
= qemu_get_be64(f
);
489 flags
= addr
& ~SECTOR_MASK
;
492 if(flags
& BLK_MIG_FLAG_DEVICE_BLOCK
) {
494 /* get device name */
495 len
= qemu_get_byte(f
);
497 qemu_get_buffer(f
, (uint8_t *)device_name
, len
);
498 device_name
[len
] = '\0';
500 bs
= bdrv_find(device_name
);
502 qemu_get_buffer(f
, buf
,
506 bdrv_write(bs
, (addr
>> SECTOR_BITS
),
507 buf
, block_mig_state
->sectors_per_block
);
509 printf("Error unknown block device %s\n", device_name
);
511 } else if(flags
& BLK_MIG_FLAG_EOS
) {
514 printf("Unknown flags\n");
516 } while(!(flags
& BLK_MIG_FLAG_EOS
));
523 static void block_set_params(int blk_enable
, int shared_base
, void *opaque
)
525 assert(opaque
== block_mig_state
);
527 block_mig_state
->blk_enable
= blk_enable
;
528 block_mig_state
->shared_base
= shared_base
;
530 /* shared base means that blk_enable = 1 */
531 block_mig_state
->blk_enable
|= shared_base
;
536 void blk_mig_info(void)
538 BlockDriverState
*bs
;
540 for (bs
= bdrv_first
; bs
!= NULL
; bs
= bs
->next
) {
541 printf("Device %s\n", bs
->device_name
);
542 if(bs
->type
== BDRV_TYPE_HD
) {
543 printf("device %s format %s\n",
544 bs
->device_name
, bs
->drv
->format_name
);
549 void blk_mig_init(void)
552 block_mig_state
= qemu_mallocz(sizeof(BlkMigState
));
554 register_savevm_live("block", 0, 1, block_set_params
, block_save_live
,
555 NULL
, block_load
, block_mig_state
);