/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "qemu-queue.h"
#include "monitor.h"
#include "block-migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

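/* Each chunk on the wire is preceded by a 64-bit word holding the sector
 * number shifted left by BDRV_SECTOR_BITS; the flags below travel in the
 * low bits that the shift leaves free. */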
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

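/* Per-device migration state: progress cursors for the bulk (cur_sector)
 * and dirty (cur_dirty) phases, plus sector totals used for progress
 * reporting. */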
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
    int64_t cur_dirty;
    int64_t completed_sectors;
    int64_t total_sectors;
    int64_t dirty;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
} BlkMigDevState;

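/* One block in flight: the data buffer, the owning device, the source
 * sector, and the bookkeeping for the asynchronous read. */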
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

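/* Global state: the list of devices to migrate, the list of completed but
 * not yet sent reads, and counters driving rate limiting and progress. */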
typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;
    int transferred;
    int64_t total_sector_sum;
    int prev_progress;
    int bulk_completed;
    int dirty_iterations;
} BlkMigState;

static BlkMigState block_mig_state;

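/* Emit one block: the sector/flags header word, the length-prefixed device
 * name, then BLOCK_SIZE bytes of data.  block_load() parses exactly this
 * layout on the destination. */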
static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

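/* AIO completion callback: record the result, queue the block for
 * flush_blks(), and update the in-flight counters. */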
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

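/* Bulk phase: submit an asynchronous read of the next chunk of the device,
 * starting at cur_sector.  With a shared base image, unallocated sectors
 * are skipped.  Returns 1 once the device has been read to the end. */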
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        goto error;
    }
    block_mig_state.submitted++;

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

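/* Toggle dirty tracking on every device queued for migration. */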
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

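/* Stage 1 setup: reset the global counters, then queue a BlkMigDevState
 * for every hard disk with a non-zero length and accumulate the total
 * sector count used for progress reporting. */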
static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    BlkMigDevState *bmds;
    BlockDriverState *bs;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            if (sectors == 0) {
                continue;
            }

            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = sectors;
            bmds->completed_sectors = 0;
            bmds->shared_base = block_mig_state.shared_base;

            block_mig_state.total_sector_sum += sectors;

            if (bmds->shared_base) {
                monitor_printf(mon, "Start migration for %s with shared base "
                                    "image\n",
                               bs->device_name);
            } else {
                monitor_printf(mon, "Start full migration for %s\n",
                               bs->device_name);
            }

            QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
        }
    }
}

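/* Advance the bulk phase by one chunk on the first device that has not
 * finished it, emitting a BLK_MIG_FLAG_PROGRESS marker whenever the
 * overall percentage changes.  Returns 0 once every device is done. */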
static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    progress = completed_sector_sum * 100 / block_mig_state.total_sector_sum;
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        monitor_printf(mon, "Completed %d %%\r", progress);
        monitor_flush(mon);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

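/* Dirty phase: scan the dirty bitmap from cur_dirty and transfer one dirty
 * chunk, asynchronously during stage 2 and synchronously during the final
 * stage 3 flush.  Returns 1 once the cursor has passed the end of the
 * device. */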
static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                                 BlkMigDevState *bmds, int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        if (bdrv_get_dirty(bmds->bs, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = qemu_malloc(sizeof(BlkMigBlock));
            blk->buf = qemu_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);
                if (!blk->aiocb) {
                    goto error;
                }
                block_mig_state.submitted++;
            } else {
                if (bdrv_read(bmds->bs, sector, blk->buf,
                              nr_sectors) < 0) {
                    goto error;
                }
                blk_send(f, blk);

                qemu_free(blk->buf);
                qemu_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

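/* Send one dirty chunk from the first device that still has dirty blocks.
 * Returns 1 while at least one device has dirty data left. */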
static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
            ret = 1;
            break;
        }
    }

    return ret;
}

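/* Drain completed reads onto the migration stream, stopping early when the
 * rate limit is reached or a read reported an error. */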
static void flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __func__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            qemu_file_set_error(f);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

static int is_stage2_completed(void)
{
    return (block_mig_state.submitted == 0 && block_mig_state.bulk_completed);
}

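/* Free every queued device and block, switch dirty tracking back off, and
 * finish the monitor progress line. */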
static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    set_dirty_tracking(0);

    monitor_printf(mon, "\n");
}

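/*
 * Live savevm handler.  Stage 1 sets up the device list and enables dirty
 * tracking; stage 2 runs iteratively, pushing bulk and then dirty chunks
 * while staying under the bandwidth limit; stage 3 synchronously flushes
 * whatever dirty blocks remain.  A negative stage means the migration was
 * cancelled and we only clean up.
 */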
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    DPRINTF("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    blk_mig_reset_dirty_cursor();

    if (stage == 2) {
        /* control the rate of transfer */
        while ((block_mig_state.submitted +
                block_mig_state.read_done) * BLOCK_SIZE <
               qemu_file_get_rate_limit(f)) {
            if (block_mig_state.bulk_completed == 0) {
                /* first finish the bulk phase */
                if (blk_mig_save_bulked_block(mon, f) == 0) {
                    /* finished saving bulk on all devices */
                    block_mig_state.bulk_completed = 1;
                }
            } else {
                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                    /* no more dirty blocks */
                    break;
                }
            }
        }

        flush_blks(f);

        if (qemu_file_has_error(f)) {
            blk_mig_cleanup(mon);
            return 0;
        }
    }

    if (stage == 3) {
        /* we know for sure that the bulk save is completed */

        while (blk_mig_save_dirty_block(mon, f, 0) != 0) {
            /* keep flushing until no dirty blocks remain */
        }
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

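/*
 * Destination side: parse the stream produced by blk_send() above, writing
 * device blocks in place and echoing progress markers until the EOS flag
 * arrives.
 */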
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);

            qemu_free(buf);
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* a shared base image implies blk_enable == 1 */
    block_mig_state.blk_enable |= shared_base;
}

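/* Register the block live-migration handler with savevm at startup. */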
void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, &block_mig_state);
}