/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "block-migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS          0x02

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

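/* Per-device migration state: the device, its bulk-phase progress
 * (cur_sector), its total size in sectors, and whether only sectors
 * allocated on top of a shared base image need to be sent. */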
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    struct BlkMigDevState *next;
    int64_t cur_sector;
    int64_t total_sectors;
    int64_t dirty;
} BlkMigDevState;

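/* One chunk in flight: the data buffer, the device and sector it came
 * from, the iovec/qiov handed to the AIO read, and the completion status. */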
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    struct BlkMigBlock *next;
} BlkMigBlock;

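/* Global migration state: the device list, the queue of completed but
 * unsent reads (first_blk/last_blk), and progress counters.  Note that
 * no_dirty and load_file are not used anywhere in this file. */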
typedef struct BlkMigState {
    int bulk_completed;
    int blk_enable;
    int shared_base;
    int no_dirty;
    QEMUFile *load_file;
    BlkMigDevState *bmds_first;
    BlkMigBlock *first_blk;
    BlkMigBlock *last_blk;
    int submitted;
    int read_done;
    int transferred;
    int64_t print_completion;
} BlkMigState;

static BlkMigState block_mig_state;

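/* AIO completion callback: record the read status and append the block to
 * the tail of the read-done queue so that flush_blks() can send it. */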
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    /* insert at the end of the read-done queue */
    if (block_mig_state.last_blk == NULL) {
        block_mig_state.first_blk = blk;
        block_mig_state.last_blk = blk;
    } else {
        block_mig_state.last_blk->next = blk;
        block_mig_state.last_blk = blk;
    }

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

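/* Asynchronous bulk phase: submit an AIO read for the next chunk of one
 * device.  Returns 1 once the last chunk of the device has been submitted
 * (or the device needs nothing more), 0 otherwise; on a failed AIO
 * submission the block is freed and 0 is returned so the chunk is retried. */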
static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = total_sectors;
        qemu_free(blk->buf);
        qemu_free(blk);
        return 1;
    }

    if (cur_sector >= block_mig_state.print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state.print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* round down to a chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    bmds->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bmds;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    if (!blk->aiocb) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    bdrv_reset_dirty(bmds->bs, cur_sector, nr_sectors);
    block_mig_state.submitted++;

    return (bmds->cur_sector >= total_sectors);
}

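/* Synchronous bulk phase, used from stage 3: read the next chunk of one
 * device with bdrv_read() and send it immediately.  Returns 1 when the
 * device has been fully transferred. */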
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    tmp_buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    if (cur_sector >= block_mig_state.print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state.print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + BDRV_SECTORS_PER_DIRTY_CHUNK;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}

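/* Send one chunk.  The on-wire format, matched by block_load(), is: a be64
 * word holding (sector << BDRV_SECTOR_BITS) ORed with the flags, a one-byte
 * device-name length, the name itself, and BLOCK_SIZE bytes of data. */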
static void send_blk(QEMUFile *f, BlkMigBlock *blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
    /* Placeholder: no per-device metadata is sent yet. */
}

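/* Enable or disable dirty-sector tracking on every device being migrated. */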
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state.bmds_first; bmds != NULL; bmds = bmds->next) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

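/* Build the device list: every BDRV_TYPE_HD drive gets a BlkMigDevState
 * appended to the global list. */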
static void init_blk_migration(QEMUFile *f)
{
    BlkMigDevState **pbmds, *bmds;
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            bmds->shared_base = block_mig_state.shared_base;

            if (bmds->shared_base) {
                printf("Start migration for %s with shared base image\n",
                       bs->device_name);
            } else {
                printf("Start full migration for %s\n", bs->device_name);
            }

            /* insert at the end */
            pbmds = &block_mig_state.bmds_first;
            while (*pbmds != NULL) {
                pbmds = &(*pbmds)->next;
            }
            *pbmds = bmds;

            blk_mig_save_dev_info(f, bmds);
        }
    }
}

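/* Advance the bulk phase by one chunk on the first device that has not yet
 * completed it, asynchronously (AIO) or synchronously depending on
 * is_async.  Returns 0 once every device has finished its bulk phase. */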
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state.bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            if (is_async) {
                if (mig_read_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            } else {
                if (mig_save_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            }
            return 1;
        }
    }

    /* reaching here means the bulk phase has completed for every device */
    block_mig_state.bulk_completed = 1;

    return 0;
}

#define MAX_NUM_BLOCKS 4

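/* Stage-3 pass: walk each device's dirty bitmap and synchronously resend
 * every chunk that was written to while the bulk phase was in progress. */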
static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    uint8_t *buf;
    int64_t sector;
    int len;

    buf = qemu_malloc(BLOCK_SIZE);

    for (bmds = block_mig_state.bmds_first; bmds != NULL; bmds = bmds->next) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, buf,
                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
                    /* FIXME: add error handling */
                }

                /* sector number and flags */
                qemu_put_be64(f, (sector << BDRV_SECTOR_BITS)
                                 | BLK_MIG_FLAG_DEVICE_BLOCK);

                /* device name */
                len = strlen(bmds->bs->device_name);
                qemu_put_byte(f, len);
                qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);

                qemu_put_buffer(f, buf, BLOCK_SIZE);

                bdrv_reset_dirty(bmds->bs, sector,
                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }

    qemu_free(buf);
}

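/* Drain the read-done queue: send completed blocks to the migration stream
 * until the queue is empty or the bandwidth limit is reached. */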
static void flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk, *next;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    for (blk = block_mig_state.first_blk;
         blk != NULL && !qemu_file_rate_limit(f);
         blk = next) {
        send_blk(f, blk);

        next = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    block_mig_state.first_blk = blk;

    if (block_mig_state.first_blk == NULL) {
        block_mig_state.last_blk = NULL;
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

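/* Stage 2 is complete when no AIO reads are in flight and every device has
 * finished its bulk phase. */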
static int is_stage2_completed(void)
{
    BlkMigDevState *bmds;

    if (block_mig_state.submitted > 0) {
        return 0;
    }

    for (bmds = block_mig_state.bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            return 0;
        }
    }

    return 1;
}

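/* Live-migration save handler.  Stage 1 builds the device list and turns
 * on dirty tracking; stage 2 streams bulk data within the bandwidth limit;
 * stage 3, which runs once the guest is stopped, sends the remaining bulk
 * data plus all dirty chunks and turns tracking off. */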
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    /* control the rate of transfer */
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);

        /* stop tracking dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

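/* Load handler: keep reading (flags, device name, data) records and write
 * each chunk to the named device until the EOS flag is seen. */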
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);

            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (bs != NULL) {
                bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
            } else {
                printf("Error unknown block device %s\n", device_name);
                /* FIXME: add error handling */
            }

            qemu_free(buf);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            printf("Unknown flags\n");
            /* FIXME: add error handling */
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* shared base implies blk_enable = 1 */
    block_mig_state.blk_enable |= shared_base;
}

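/* Debug helper: print every drive, plus format information for the hard
 * disks that block migration would consider. */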
void blk_mig_info(void)
{
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        printf("Device %s\n", bs->device_name);
        if (bs->type == BDRV_TYPE_HD) {
            printf("device %s format %s\n",
                   bs->device_name, bs->drv->format_name);
        }
    }
}

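/* Register the "block" section handlers with the live-savevm machinery. */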
void blk_mig_init(void)
{
    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, &block_mig_state);
}