/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "block-migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS          0x02

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

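/* Per-device migration state: bulk-copy progress for one block device,
   kept in a singly linked list headed by BlkMigState.bmds_first. */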
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    struct BlkMigDevState *next;
    int64_t cur_sector;
    int64_t total_sectors;
    int64_t dirty;
} BlkMigDevState;

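/* One in-flight asynchronous read: the data buffer, the owning device
   state, the starting sector, and the AIO request that fills it. */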
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    struct BlkMigBlock *next;
} BlkMigBlock;

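/* Global migration state: the device list, the queue of completed reads
   waiting to be flushed into the stream, and transfer counters. */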
typedef struct BlkMigState {
    int bulk_completed;
    int blk_enable;
    int shared_base;
    int no_dirty;
    QEMUFile *load_file;
    BlkMigDevState *bmds_first;
    BlkMigBlock *first_blk;
    BlkMigBlock *last_blk;
    int submitted;
    int read_done;
    int transferred;
    int64_t print_completion;
} BlkMigState;

static BlkMigState *block_mig_state = NULL;

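/* AIO completion callback: append the finished read to the tail of the
   send queue and update the submitted/read_done counters. */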
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    /* insert at the end */
    if (block_mig_state->last_blk == NULL) {
        block_mig_state->first_blk = blk;
        block_mig_state->last_blk = blk;
    } else {
        block_mig_state->last_blk->next = blk;
        block_mig_state->last_blk = blk;
    }

    block_mig_state->submitted--;
    block_mig_state->read_done++;
    assert(block_mig_state->submitted >= 0);
}

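/* Asynchronous bulk copy: submit one chunk-aligned read via
   bdrv_aio_readv().  Returns 1 once the whole device has been submitted,
   0 otherwise. */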
static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bms->bs;
    BlkMigBlock *blk;

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bms->cur_sector;
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    if (bms->shared_base) {
        while (cur_sector < bms->total_sectors &&
               !bdrv_is_allocated(bms->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bms->cur_sector = total_sectors;
        qemu_free(blk->buf);
        qemu_free(blk);
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    bms->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bms;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    if (!blk->aiocb) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
    block_mig_state->submitted++;

    return (bms->cur_sector >= total_sectors);
}

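/* Synchronous bulk copy: read one chunk with bdrv_read() and write it
   straight into the migration stream.  Returns 1 once the device is
   done, 0 otherwise. */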
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    tmp_buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    if (bmds->shared_base) {
        while (cur_sector < bmds->total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + BDRV_SECTORS_PER_DIRTY_CHUNK;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}

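/* Serialize one completed read: sector number and flags, device name,
   then the BLOCK_SIZE payload. */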
static void send_blk(QEMUFile *f, BlkMigBlock * blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
}

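/* Switch dirty tracking on or off for every device in the list. */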
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

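/* Build the device list: allocate one BlkMigDevState per hard disk and
   append it at the tail, so devices migrate in discovery order. */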
static void init_blk_migration(QEMUFile *f)
{
    BlkMigDevState **pbmds, *bmds;
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            bmds->shared_base = block_mig_state->shared_base;

            if (bmds->shared_base) {
                printf("Start migration for %s with shared base image\n",
                       bs->device_name);
            } else {
                printf("Start full migration for %s\n", bs->device_name);
            }

            /* insert at the end */
            pbmds = &block_mig_state->bmds_first;
            while (*pbmds != NULL) {
                pbmds = &(*pbmds)->next;
            }
            *pbmds = bmds;

            blk_mig_save_dev_info(f, bmds);
        }
    }
}

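/* Advance the bulk phase by one block on the first device that has not
   finished yet.  Returns 1 if a block was processed, 0 once every device
   has completed its bulk section. */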
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            if (is_async) {
                if (mig_read_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            } else {
                if (mig_save_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            }
            return 1;
        }
    }

    /* reaching here means the bulk phase is complete for all devices */
    block_mig_state->bulk_completed = 1;

    return 0;
}

#define MAX_NUM_BLOCKS 4

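/* Walk each device's dirty bitmap and resend every chunk that was
   written after its bulk copy, clearing the dirty bits as we go. */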
static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    uint8_t buf[BLOCK_SIZE];
    int64_t sector;
    int len;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, buf,
                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
                    /* FIXME: add error handling */
                }

                /* sector number and flags */
                qemu_put_be64(f, (sector << BDRV_SECTOR_BITS)
                                 | BLK_MIG_FLAG_DEVICE_BLOCK);

                /* device name */
                len = strlen(bmds->bs->device_name);
                qemu_put_byte(f, len);
                qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);

                qemu_put_buffer(f, buf, BLOCK_SIZE);

                bdrv_reset_dirty(bmds->bs, sector,
                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }
}

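/* Drain completed reads into the migration stream, stopping early when
   the rate limit is reached. */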
static void flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk, *next;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state->submitted, block_mig_state->read_done,
            block_mig_state->transferred);

    for (blk = block_mig_state->first_blk;
         blk != NULL && !qemu_file_rate_limit(f);
         blk = next) {
        send_blk(f, blk);

        next = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state->read_done--;
        block_mig_state->transferred++;
        assert(block_mig_state->read_done >= 0);
    }
    block_mig_state->first_blk = blk;

    if (block_mig_state->first_blk == NULL) {
        block_mig_state->last_blk = NULL;
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state->submitted, block_mig_state->read_done,
            block_mig_state->transferred);
}

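/* Stage 2 is complete once no reads are in flight and every device has
   finished its bulk section. */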
static int is_stage2_completed(void)
{
    BlkMigDevState *bmds;

    if (block_mig_state->submitted > 0) {
        return 0;
    }

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            return 0;
        }
    }

    return 1;
}

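/* Live save handler: stage 1 sets up the device list and dirty tracking,
   stage 2 streams bulk blocks within the rate limit, and stage 3 (the
   final pass) drains the remaining bulk and dirty blocks. */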
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n", stage,
            block_mig_state->submitted, block_mig_state->transferred);

    if (block_mig_state->blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    /* control the rate of transfer */
    while ((block_mig_state->submitted +
            block_mig_state->read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);

        /* stop tracking dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

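/* Destination side: read (sector|flags, device name, payload) records
   and write each block to the named device until the EOS flag. */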
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    buf = qemu_malloc(BLOCK_SIZE);

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);

            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (bs != NULL) {
                bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
            } else {
                printf("Error unknown block device %s\n", device_name);
                /* FIXME: add error handling */
            }
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            printf("Unknown flags\n");
            /* FIXME: add error handling */
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    qemu_free(buf);

    return 0;
}

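/* Apply migration parameters; using a shared base image implies that
   block migration itself is enabled. */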
static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    assert(opaque == block_mig_state);

    block_mig_state->blk_enable = blk_enable;
    block_mig_state->shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state->blk_enable |= shared_base;
}

void blk_mig_info(void)
{
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        printf("Device %s\n", bs->device_name);

        if (bs->type == BDRV_TYPE_HD) {
            printf("device %s format %s\n",
                   bs->device_name, bs->drv->format_name);
        }
    }
}

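/* Allocate the global state and register the "block" live savevm
   handlers; block_mig_state is also the opaque passed to each callback. */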
void blk_mig_init(void)
{
    block_mig_state = qemu_mallocz(sizeof(BlkMigState));

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, block_mig_state);
}