]> git.proxmox.com Git - qemu.git/blame - block-migration.c
Block live migration
[qemu.git] / block-migration.c
CommitLineData
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
13
#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "block-migration.h"

#include <assert.h>
#include <pthread.h>
#include <string.h>
20
#define SECTOR_BITS 9
#define SECTOR_SIZE (1 << SECTOR_BITS)
/* Mask selecting the sector-aligned part of an address.
 * NOTE: the previous definition carried a trailing ';', which only
 * compiled because every use happened to end a statement; it broke in
 * any other expression context.  Fully parenthesized here. */
#define SECTOR_MASK (~(SECTOR_SIZE - 1))

/* Size in bytes of one migration chunk (runtime value, valid only
 * after block_mig_state->sectors_per_block has been initialized). */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

/* Flags carried in the low (sub-sector) bits of the on-the-wire address. */
#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS 0x02

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif
44
/* One in-flight or ready-to-send chunk of guest disk data. */
typedef struct BlkMigBlock {
    uint8_t *buf;               /* payload, BLOCK_SIZE bytes */
    BlkMigDevState *bmds;       /* device this block belongs to */
    int64_t sector;             /* first sector covered by buf */
    struct iovec iov;           /* single-element iovec over buf */
    QEMUIOVector qiov;          /* wraps iov for bdrv_aio_readv() */
    BlockDriverAIOCB *aiocb;    /* pending AIO request, if any */
    int ret;                    /* AIO completion status */
    struct BlkMigBlock *next;   /* FIFO link in first_blk/last_blk list */
} BlkMigBlock;
55
/* Global state of the block migration; a single instance is allocated
 * in blk_mig_init() and pointed to by block_mig_state. */
typedef struct BlkMigState {
    int bulk_completed;         /* all devices finished the bulk phase */
    int blk_enable;             /* block migration requested */
    int shared_base;            /* devices sit on a shared base image */
    int no_dirty;               /* NOTE(review): never read in this file */
    QEMUFile *load_file;        /* NOTE(review): never read in this file */
    BlkMigDevState *bmds_first; /* head of the migrated-device list */
    int sectors_per_block;      /* sectors per migration chunk */
    BlkMigBlock *first_blk;     /* head of the completed-read FIFO */
    BlkMigBlock *last_blk;      /* tail of the completed-read FIFO */
    int submitted;              /* AIO reads currently in flight */
    int read_done;              /* blocks read, waiting to be sent */
    int transferred;            /* blocks already written to the stream */
    int64_t print_completion;   /* next progress-print threshold (sectors) */
} BlkMigState;

/* Allocated once in blk_mig_init(). */
static BlkMigState *block_mig_state = NULL;
73
74static void blk_mig_read_cb(void *opaque, int ret)
75{
76 BlkMigBlock *blk = opaque;
77
78 blk->ret = ret;
79
80 /* insert at the end */
81 if(block_mig_state->last_blk == NULL) {
82 block_mig_state->first_blk = blk;
83 block_mig_state->last_blk = blk;
84 } else {
85 block_mig_state->last_blk->next = blk;
86 block_mig_state->last_blk = blk;
87 }
88
89 block_mig_state->submitted--;
90 block_mig_state->read_done++;
91 assert(block_mig_state->submitted >= 0);
92
93 return;
94}
95
96static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
97{
98 int nr_sectors;
99 int64_t total_sectors, cur_sector = 0;
100 BlockDriverState *bs = bms->bs;
101 BlkMigBlock *blk;
102
103 blk = qemu_malloc(sizeof(BlkMigBlock));
104 blk->buf = qemu_malloc(BLOCK_SIZE);
105
106 cur_sector = bms->cur_sector;
107 total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
108
109 if(bms->shared_base) {
110 while(cur_sector < bms->total_sectors &&
111 !bdrv_is_allocated(bms->bs, cur_sector,
112 MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
113 cur_sector += nr_sectors;
114 }
115 }
116
117 if(cur_sector >= total_sectors) {
118 bms->cur_sector = total_sectors;
119 qemu_free(blk->buf);
120 qemu_free(blk);
121 return 1;
122 }
123
124 if(cur_sector >= block_mig_state->print_completion) {
125 printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
126 fflush(stdout);
127 block_mig_state->print_completion +=
128 (block_mig_state->sectors_per_block * 10000);
129 }
130
131 /* we going to transfder BLOCK_SIZE any way even if it is not allocated */
132 nr_sectors = block_mig_state->sectors_per_block;
133
134 cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1);
135
136 if(total_sectors - cur_sector < block_mig_state->sectors_per_block) {
137 nr_sectors = (total_sectors - cur_sector);
138 }
139
140 bms->cur_sector = cur_sector + nr_sectors;
141 blk->sector = cur_sector;
142 blk->bmds = bms;
143 blk->next = NULL;
144
145 blk->iov.iov_base = blk->buf;
146 blk->iov.iov_len = nr_sectors * SECTOR_SIZE;
147 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
148
149 blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
150 nr_sectors, blk_mig_read_cb, blk);
151
152 if(!blk->aiocb) {
153 printf("Error reading sector %" PRId64 "\n", cur_sector);
154 qemu_free(blk->buf);
155 qemu_free(blk);
156 return 0;
157 }
158
159 bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
160 block_mig_state->submitted++;
161
162 return (bms->cur_sector >= total_sectors);
163}
164
165static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
166{
167 int len, nr_sectors;
168 int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
169 uint8_t *tmp_buf = NULL;
170 BlockDriverState *bs = bmds->bs;
171
172 tmp_buf = qemu_malloc(BLOCK_SIZE);
173
174 cur_sector = bmds->cur_sector;
175
176 if(bmds->shared_base) {
177 while(cur_sector < bmds->total_sectors &&
178 !bdrv_is_allocated(bmds->bs, cur_sector,
179 MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
180 cur_sector += nr_sectors;
181 }
182 }
183
184 if(cur_sector >= total_sectors) {
185 bmds->cur_sector = total_sectors;
186 qemu_free(tmp_buf);
187 return 1;
188 }
189
190 if(cur_sector >= block_mig_state->print_completion) {
191 printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
192 fflush(stdout);
193 block_mig_state->print_completion +=
194 (block_mig_state->sectors_per_block * 10000);
195 }
196
197 cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1);
198
199 /* we going to transfer
200 BLOCK_SIZE
201 any way even if it is not allocated */
202 nr_sectors = block_mig_state->sectors_per_block;
203
204 if(total_sectors - cur_sector < block_mig_state->sectors_per_block) {
205 nr_sectors = (total_sectors - cur_sector);
206 }
207
208 if(bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
209 printf("Error reading sector %" PRId64 "\n", cur_sector);
210 }
211
212 bdrv_reset_dirty(bs, cur_sector, nr_sectors);
213
214 /* Device name */
215 qemu_put_be64(f,(cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);
216
217 len = strlen(bs->device_name);
218 qemu_put_byte(f, len);
219 qemu_put_buffer(f, (uint8_t *)bs->device_name, len);
220
221 qemu_put_buffer(f, tmp_buf,
222 BLOCK_SIZE);
223
224 bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block;
225
226 qemu_free(tmp_buf);
227
228 return (bmds->cur_sector >= total_sectors);
229}
230
231static void send_blk(QEMUFile *f, BlkMigBlock * blk)
232{
233 int len;
234
235 /* Device name */
236 qemu_put_be64(f,(blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);
237
238 len = strlen(blk->bmds->bs->device_name);
239 qemu_put_byte(f, len);
240 qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);
241
242 qemu_put_buffer(f, blk->buf,
243 BLOCK_SIZE);
244
245 return;
246}
247
/* Placeholder: per-device header information is not transmitted yet;
 * the receiver identifies devices by the name sent with each block. */
static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
}
251
252static void set_dirty_tracking(int enable)
253{
254 BlkMigDevState *bmds;
255 for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
256 bdrv_set_dirty_tracking(bmds->bs,enable);
257 }
258
259 return;
260}
261
262static void init_blk_migration(QEMUFile *f)
263{
264 BlkMigDevState **pbmds, *bmds;
265 BlockDriverState *bs;
266
267 for (bs = bdrv_first; bs != NULL; bs = bs->next) {
268 if(bs->type == BDRV_TYPE_HD) {
269 bmds = qemu_mallocz(sizeof(BlkMigDevState));
270 bmds->bs = bs;
271 bmds->bulk_completed = 0;
272 bmds->total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
273 bmds->shared_base = block_mig_state->shared_base;
274
275 if(bmds->shared_base) {
276 printf("Start migration for %s with shared base image\n",
277 bs->device_name);
278 } else {
279 printf("Start full migration for %s\n", bs->device_name);
280 }
281
282 /* insert at the end */
283 pbmds = &block_mig_state->bmds_first;
284 while (*pbmds != NULL)
285 pbmds = &(*pbmds)->next;
286 *pbmds = bmds;
287
288 blk_mig_save_dev_info(f, bmds);
289
290 }
291 }
292
293 block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
294
295 return;
296}
297
298static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
299{
300 BlkMigDevState *bmds;
301
302 for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
303 if(bmds->bulk_completed == 0) {
304 if(is_async) {
305 if(mig_read_device_bulk(f, bmds) == 1) {
306 /* completed bulk section for this device */
307 bmds->bulk_completed = 1;
308 }
309 } else {
310 if(mig_save_device_bulk(f,bmds) == 1) {
311 /* completed bulk section for this device */
312 bmds->bulk_completed = 1;
313 }
314 }
315 return 1;
316 }
317 }
318
319 /* we reached here means bulk is completed */
320 block_mig_state->bulk_completed = 1;
321
322 return 0;
323
324}
325
326#define MAX_NUM_BLOCKS 4
327
328static void blk_mig_save_dirty_blocks(QEMUFile *f)
329{
330 BlkMigDevState *bmds;
331 uint8_t buf[BLOCK_SIZE];
332 int64_t sector;
333 int len;
334
335 for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
336 for(sector = 0; sector < bmds->cur_sector;) {
337
338 if(bdrv_get_dirty(bmds->bs,sector)) {
339
340 if(bdrv_read(bmds->bs, sector, buf,
341 block_mig_state->sectors_per_block) < 0) {
342 }
343
344 /* device name */
345 qemu_put_be64(f,(sector << SECTOR_BITS)
346 | BLK_MIG_FLAG_DEVICE_BLOCK);
347
348 len = strlen(bmds->bs->device_name);
349
350 qemu_put_byte(f, len);
351 qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);
352
353 qemu_put_buffer(f, buf,
354 (block_mig_state->sectors_per_block *
355 SECTOR_SIZE));
356
357 bdrv_reset_dirty(bmds->bs, sector,
358 block_mig_state->sectors_per_block);
359
360 sector += block_mig_state->sectors_per_block;
361 } else {
362 /* sector is clean */
363 sector += block_mig_state->sectors_per_block;
364 }
365 }
366 }
367
368 return;
369}
370
371static void flush_blks(QEMUFile* f)
372{
373 BlkMigBlock *blk, *tmp;
374
375 dprintf("%s Enter submitted %d read_done %d transfered\n", __FUNCTION__,
376 submitted, read_done, transfered);
377
378 for(blk = block_mig_state->first_blk;
379 blk != NULL && !qemu_file_rate_limit(f); blk = tmp) {
380 send_blk(f, blk);
381
382 tmp = blk->next;
383 qemu_free(blk->buf);
384 qemu_free(blk);
385
386 block_mig_state->read_done--;
387 block_mig_state->transferred++;
388 assert(block_mig_state->read_done >= 0);
389 }
390 block_mig_state->first_blk = blk;
391
392 if(block_mig_state->first_blk == NULL) {
393 block_mig_state->last_blk = NULL;
394 }
395
396 dprintf("%s Exit submitted %d read_done %d transferred%d\n", __FUNCTION__,
397 block_mig_state->submitted, block_mig_state->read_done,
398 block_mig_state->transferred);
399
400 return;
401}
402
403static int is_stage2_completed(void)
404{
405 BlkMigDevState *bmds;
406
407 if(block_mig_state->submitted > 0) {
408 return 0;
409 }
410
411 for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
412 if(bmds->bulk_completed == 0) {
413 return 0;
414 }
415 }
416
417 return 1;
418}
419
420static int block_save_live(QEMUFile *f, int stage, void *opaque)
421{
422 int ret = 1;
423
424 dprintf("Enter save live stage %d submitted %d transferred %d\n", stage,
425 submitted, transferred);
426
427 if(block_mig_state->blk_enable != 1) {
428 /* no need to migrate storage */
429
430 qemu_put_be64(f,BLK_MIG_FLAG_EOS);
431 return 1;
432 }
433
434 if(stage == 1) {
435 init_blk_migration(f);
436
437 /* start track dirty blocks */
438 set_dirty_tracking(1);
439
440 }
441
442 flush_blks(f);
443
444 /* control the rate of transfer */
445 while ((block_mig_state->submitted + block_mig_state->read_done) *
446 (BLOCK_SIZE) <
447 (qemu_file_get_rate_limit(f))) {
448
449 ret = blk_mig_save_bulked_block(f, 1);
450
451 if (ret == 0) /* no more bulk blocks for now*/
452 break;
453 }
454
455 flush_blks(f);
456
457 if(stage == 3) {
458
459 while(blk_mig_save_bulked_block(f, 0) != 0);
460
461 blk_mig_save_dirty_blocks(f);
462
463 /* stop track dirty blocks */
464 set_dirty_tracking(0);;
465
466 printf("\nBlock migration completed\n");
467 }
468
469 qemu_put_be64(f,BLK_MIG_FLAG_EOS);
470
471 return ((stage == 2) && is_stage2_completed());
472}
473
474static int block_load(QEMUFile *f, void *opaque, int version_id)
475{
476 int len, flags;
477 char device_name[256];
478 int64_t addr;
479 BlockDriverState *bs;
480 uint8_t *buf;
481
482 block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
483 buf = qemu_malloc(BLOCK_SIZE);
484
485 do {
486
487 addr = qemu_get_be64(f);
488
489 flags = addr & ~SECTOR_MASK;
490 addr &= SECTOR_MASK;
491
492 if(flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
493
494 /* get device name */
495 len = qemu_get_byte(f);
496
497 qemu_get_buffer(f, (uint8_t *)device_name, len);
498 device_name[len] = '\0';
499
500 bs = bdrv_find(device_name);
501
502 qemu_get_buffer(f, buf,
503 BLOCK_SIZE);
504 if(bs != NULL) {
505
506 bdrv_write(bs, (addr >> SECTOR_BITS),
507 buf, block_mig_state->sectors_per_block);
508 } else {
509 printf("Error unknown block device %s\n", device_name);
510 }
511 } else if(flags & BLK_MIG_FLAG_EOS) {
512
513 } else {
514 printf("Unknown flags\n");
515 }
516 } while(!(flags & BLK_MIG_FLAG_EOS));
517
518 qemu_free(buf);
519
520 return 0;
521}
522
523static void block_set_params(int blk_enable, int shared_base, void *opaque)
524{
525 assert(opaque == block_mig_state);
526
527 block_mig_state->blk_enable = blk_enable;
528 block_mig_state->shared_base = shared_base;
529
530 /* shared base means that blk_enable = 1 */
531 block_mig_state->blk_enable |= shared_base;
532
533 return;
534}
535
536void blk_mig_info(void)
537{
538 BlockDriverState *bs;
539
540 for (bs = bdrv_first; bs != NULL; bs = bs->next) {
541 printf("Device %s\n", bs->device_name);
542 if(bs->type == BDRV_TYPE_HD) {
543 printf("device %s format %s\n",
544 bs->device_name, bs->drv->format_name);
545 }
546 }
547}
548
549void blk_mig_init(void)
550{
551
552 block_mig_state = qemu_mallocz(sizeof(BlkMigState));
553
554 register_savevm_live("block", 0, 1, block_set_params, block_save_live,
555 NULL, block_load, block_mig_state);
556
557
558}