]>
Commit | Line | Data |
---|---|---|
c163b5ca | 1 | /* |
2 | * QEMU live block migration | |
3 | * | |
4 | * Copyright IBM, Corp. 2009 | |
5 | * | |
6 | * Authors: | |
7 | * Liran Schour <lirans@il.ibm.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2. See | |
10 | * the COPYING file in the top-level directory. | |
11 | * | |
12 | */ | |
13 | ||
#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "block-migration.h"
#include <assert.h>
#include <errno.h>
#include <string.h>
c163b5ca | 19 | |
/* log2 of the sector size used by the migration stream */
#define SECTOR_BITS 9
/* sector size in bytes */
#define SECTOR_SIZE (1 << SECTOR_BITS)
/*
 * Mask that clears the byte offset inside a sector.  Fully parenthesized
 * and without a trailing semicolon so that it can be used inside larger
 * expressions (e.g. "x & SECTOR_MASK" or "~SECTOR_MASK").
 */
#define SECTOR_MASK (~(SECTOR_SIZE - 1))
23 | ||
/*
 * Size in bytes of one migration block.  Expands to a read of the global
 * block_mig_state, so it is only meaningful once sectors_per_block is set.
 */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

/* flag bits carried in the low bits of each 64-bit record header */
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02

/* upper bound on sectors examined per bdrv_is_allocated() call */
#define MAX_IS_ALLOCATED_SEARCH         65536
/* tuning constants (not referenced by the code visible in this file) */
#define MAX_BLOCKS_READ                 10000
#define BLOCKS_READ_CHANGE              100
#define INITIAL_BLOCKS_READ             100
33 | ||
/* Uncomment to get debug traces on stdout. */
//#define DEBUG_BLK_MIGRATION

/*
 * dprintf(): printf-style trace helper.  Prefixes every message with
 * "blk_migration: " when DEBUG_BLK_MIGRATION is defined and expands to an
 * empty statement otherwise (the arguments are then not evaluated).
 */
#ifndef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { } while (0)
#else
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#endif
43 | ||
44 | typedef struct BlkMigBlock { | |
45 | uint8_t *buf; | |
46 | BlkMigDevState *bmds; | |
47 | int64_t sector; | |
48 | struct iovec iov; | |
49 | QEMUIOVector qiov; | |
50 | BlockDriverAIOCB *aiocb; | |
51 | int ret; | |
52 | struct BlkMigBlock *next; | |
53 | } BlkMigBlock; | |
54 | ||
55 | typedef struct BlkMigState { | |
56 | int bulk_completed; | |
57 | int blk_enable; | |
58 | int shared_base; | |
59 | int no_dirty; | |
60 | QEMUFile *load_file; | |
61 | BlkMigDevState *bmds_first; | |
62 | int sectors_per_block; | |
63 | BlkMigBlock *first_blk; | |
64 | BlkMigBlock *last_blk; | |
65 | int submitted; | |
66 | int read_done; | |
67 | int transferred; | |
68 | int64_t print_completion; | |
69 | } BlkMigState; | |
70 | ||
71 | static BlkMigState *block_mig_state = NULL; | |
72 | ||
73 | static void blk_mig_read_cb(void *opaque, int ret) | |
74 | { | |
75 | BlkMigBlock *blk = opaque; | |
76 | ||
77 | blk->ret = ret; | |
78 | ||
79 | /* insert at the end */ | |
80 | if(block_mig_state->last_blk == NULL) { | |
81 | block_mig_state->first_blk = blk; | |
82 | block_mig_state->last_blk = blk; | |
83 | } else { | |
84 | block_mig_state->last_blk->next = blk; | |
85 | block_mig_state->last_blk = blk; | |
86 | } | |
87 | ||
88 | block_mig_state->submitted--; | |
89 | block_mig_state->read_done++; | |
90 | assert(block_mig_state->submitted >= 0); | |
91 | ||
92 | return; | |
93 | } | |
94 | ||
95 | static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms) | |
96 | { | |
97 | int nr_sectors; | |
98 | int64_t total_sectors, cur_sector = 0; | |
99 | BlockDriverState *bs = bms->bs; | |
100 | BlkMigBlock *blk; | |
101 | ||
102 | blk = qemu_malloc(sizeof(BlkMigBlock)); | |
103 | blk->buf = qemu_malloc(BLOCK_SIZE); | |
104 | ||
105 | cur_sector = bms->cur_sector; | |
106 | total_sectors = bdrv_getlength(bs) >> SECTOR_BITS; | |
107 | ||
108 | if(bms->shared_base) { | |
109 | while(cur_sector < bms->total_sectors && | |
110 | !bdrv_is_allocated(bms->bs, cur_sector, | |
111 | MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { | |
112 | cur_sector += nr_sectors; | |
113 | } | |
114 | } | |
115 | ||
116 | if(cur_sector >= total_sectors) { | |
117 | bms->cur_sector = total_sectors; | |
118 | qemu_free(blk->buf); | |
119 | qemu_free(blk); | |
120 | return 1; | |
121 | } | |
122 | ||
123 | if(cur_sector >= block_mig_state->print_completion) { | |
124 | printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors); | |
125 | fflush(stdout); | |
126 | block_mig_state->print_completion += | |
127 | (block_mig_state->sectors_per_block * 10000); | |
128 | } | |
129 | ||
130 | /* we going to transfder BLOCK_SIZE any way even if it is not allocated */ | |
131 | nr_sectors = block_mig_state->sectors_per_block; | |
132 | ||
133 | cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1); | |
134 | ||
135 | if(total_sectors - cur_sector < block_mig_state->sectors_per_block) { | |
136 | nr_sectors = (total_sectors - cur_sector); | |
137 | } | |
138 | ||
139 | bms->cur_sector = cur_sector + nr_sectors; | |
140 | blk->sector = cur_sector; | |
141 | blk->bmds = bms; | |
142 | blk->next = NULL; | |
143 | ||
144 | blk->iov.iov_base = blk->buf; | |
145 | blk->iov.iov_len = nr_sectors * SECTOR_SIZE; | |
146 | qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); | |
147 | ||
148 | blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, | |
149 | nr_sectors, blk_mig_read_cb, blk); | |
150 | ||
151 | if(!blk->aiocb) { | |
152 | printf("Error reading sector %" PRId64 "\n", cur_sector); | |
153 | qemu_free(blk->buf); | |
154 | qemu_free(blk); | |
155 | return 0; | |
156 | } | |
157 | ||
158 | bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors); | |
159 | block_mig_state->submitted++; | |
160 | ||
161 | return (bms->cur_sector >= total_sectors); | |
162 | } | |
163 | ||
164 | static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) | |
165 | { | |
166 | int len, nr_sectors; | |
167 | int64_t total_sectors = bmds->total_sectors, cur_sector = 0; | |
168 | uint8_t *tmp_buf = NULL; | |
169 | BlockDriverState *bs = bmds->bs; | |
170 | ||
171 | tmp_buf = qemu_malloc(BLOCK_SIZE); | |
172 | ||
173 | cur_sector = bmds->cur_sector; | |
174 | ||
175 | if(bmds->shared_base) { | |
176 | while(cur_sector < bmds->total_sectors && | |
177 | !bdrv_is_allocated(bmds->bs, cur_sector, | |
178 | MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { | |
179 | cur_sector += nr_sectors; | |
180 | } | |
181 | } | |
182 | ||
183 | if(cur_sector >= total_sectors) { | |
184 | bmds->cur_sector = total_sectors; | |
185 | qemu_free(tmp_buf); | |
186 | return 1; | |
187 | } | |
188 | ||
189 | if(cur_sector >= block_mig_state->print_completion) { | |
190 | printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors); | |
191 | fflush(stdout); | |
192 | block_mig_state->print_completion += | |
193 | (block_mig_state->sectors_per_block * 10000); | |
194 | } | |
195 | ||
196 | cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1); | |
197 | ||
198 | /* we going to transfer | |
199 | BLOCK_SIZE | |
200 | any way even if it is not allocated */ | |
201 | nr_sectors = block_mig_state->sectors_per_block; | |
202 | ||
203 | if(total_sectors - cur_sector < block_mig_state->sectors_per_block) { | |
204 | nr_sectors = (total_sectors - cur_sector); | |
205 | } | |
206 | ||
207 | if(bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) { | |
208 | printf("Error reading sector %" PRId64 "\n", cur_sector); | |
209 | } | |
210 | ||
211 | bdrv_reset_dirty(bs, cur_sector, nr_sectors); | |
212 | ||
213 | /* Device name */ | |
214 | qemu_put_be64(f,(cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK); | |
215 | ||
216 | len = strlen(bs->device_name); | |
217 | qemu_put_byte(f, len); | |
218 | qemu_put_buffer(f, (uint8_t *)bs->device_name, len); | |
219 | ||
220 | qemu_put_buffer(f, tmp_buf, | |
221 | BLOCK_SIZE); | |
222 | ||
223 | bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block; | |
224 | ||
225 | qemu_free(tmp_buf); | |
226 | ||
227 | return (bmds->cur_sector >= total_sectors); | |
228 | } | |
229 | ||
230 | static void send_blk(QEMUFile *f, BlkMigBlock * blk) | |
231 | { | |
232 | int len; | |
233 | ||
234 | /* Device name */ | |
235 | qemu_put_be64(f,(blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK); | |
236 | ||
237 | len = strlen(blk->bmds->bs->device_name); | |
238 | qemu_put_byte(f, len); | |
239 | qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len); | |
240 | ||
241 | qemu_put_buffer(f, blk->buf, | |
242 | BLOCK_SIZE); | |
243 | ||
244 | return; | |
245 | } | |
246 | ||
247 | static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds) | |
248 | { | |
249 | } | |
250 | ||
251 | static void set_dirty_tracking(int enable) | |
252 | { | |
253 | BlkMigDevState *bmds; | |
254 | for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
255 | bdrv_set_dirty_tracking(bmds->bs,enable); | |
256 | } | |
257 | ||
258 | return; | |
259 | } | |
260 | ||
261 | static void init_blk_migration(QEMUFile *f) | |
262 | { | |
263 | BlkMigDevState **pbmds, *bmds; | |
264 | BlockDriverState *bs; | |
265 | ||
266 | for (bs = bdrv_first; bs != NULL; bs = bs->next) { | |
267 | if(bs->type == BDRV_TYPE_HD) { | |
268 | bmds = qemu_mallocz(sizeof(BlkMigDevState)); | |
269 | bmds->bs = bs; | |
270 | bmds->bulk_completed = 0; | |
271 | bmds->total_sectors = bdrv_getlength(bs) >> SECTOR_BITS; | |
272 | bmds->shared_base = block_mig_state->shared_base; | |
273 | ||
274 | if(bmds->shared_base) { | |
275 | printf("Start migration for %s with shared base image\n", | |
276 | bs->device_name); | |
277 | } else { | |
278 | printf("Start full migration for %s\n", bs->device_name); | |
279 | } | |
280 | ||
281 | /* insert at the end */ | |
282 | pbmds = &block_mig_state->bmds_first; | |
283 | while (*pbmds != NULL) | |
284 | pbmds = &(*pbmds)->next; | |
285 | *pbmds = bmds; | |
286 | ||
287 | blk_mig_save_dev_info(f, bmds); | |
288 | ||
289 | } | |
290 | } | |
291 | ||
292 | block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk(); | |
293 | ||
294 | return; | |
295 | } | |
296 | ||
297 | static int blk_mig_save_bulked_block(QEMUFile *f, int is_async) | |
298 | { | |
299 | BlkMigDevState *bmds; | |
300 | ||
301 | for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
302 | if(bmds->bulk_completed == 0) { | |
303 | if(is_async) { | |
304 | if(mig_read_device_bulk(f, bmds) == 1) { | |
305 | /* completed bulk section for this device */ | |
306 | bmds->bulk_completed = 1; | |
307 | } | |
308 | } else { | |
309 | if(mig_save_device_bulk(f,bmds) == 1) { | |
310 | /* completed bulk section for this device */ | |
311 | bmds->bulk_completed = 1; | |
312 | } | |
313 | } | |
314 | return 1; | |
315 | } | |
316 | } | |
317 | ||
318 | /* we reached here means bulk is completed */ | |
319 | block_mig_state->bulk_completed = 1; | |
320 | ||
321 | return 0; | |
322 | ||
323 | } | |
324 | ||
325 | #define MAX_NUM_BLOCKS 4 | |
326 | ||
327 | static void blk_mig_save_dirty_blocks(QEMUFile *f) | |
328 | { | |
329 | BlkMigDevState *bmds; | |
330 | uint8_t buf[BLOCK_SIZE]; | |
331 | int64_t sector; | |
332 | int len; | |
333 | ||
334 | for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
335 | for(sector = 0; sector < bmds->cur_sector;) { | |
336 | ||
337 | if(bdrv_get_dirty(bmds->bs,sector)) { | |
338 | ||
339 | if(bdrv_read(bmds->bs, sector, buf, | |
340 | block_mig_state->sectors_per_block) < 0) { | |
341 | } | |
342 | ||
343 | /* device name */ | |
344 | qemu_put_be64(f,(sector << SECTOR_BITS) | |
345 | | BLK_MIG_FLAG_DEVICE_BLOCK); | |
346 | ||
347 | len = strlen(bmds->bs->device_name); | |
348 | ||
349 | qemu_put_byte(f, len); | |
350 | qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len); | |
351 | ||
352 | qemu_put_buffer(f, buf, | |
353 | (block_mig_state->sectors_per_block * | |
354 | SECTOR_SIZE)); | |
355 | ||
356 | bdrv_reset_dirty(bmds->bs, sector, | |
357 | block_mig_state->sectors_per_block); | |
358 | ||
359 | sector += block_mig_state->sectors_per_block; | |
360 | } else { | |
361 | /* sector is clean */ | |
362 | sector += block_mig_state->sectors_per_block; | |
363 | } | |
364 | } | |
365 | } | |
366 | ||
367 | return; | |
368 | } | |
369 | ||
370 | static void flush_blks(QEMUFile* f) | |
371 | { | |
372 | BlkMigBlock *blk, *tmp; | |
373 | ||
374 | dprintf("%s Enter submitted %d read_done %d transfered\n", __FUNCTION__, | |
375 | submitted, read_done, transfered); | |
376 | ||
377 | for(blk = block_mig_state->first_blk; | |
378 | blk != NULL && !qemu_file_rate_limit(f); blk = tmp) { | |
379 | send_blk(f, blk); | |
380 | ||
381 | tmp = blk->next; | |
382 | qemu_free(blk->buf); | |
383 | qemu_free(blk); | |
384 | ||
385 | block_mig_state->read_done--; | |
386 | block_mig_state->transferred++; | |
387 | assert(block_mig_state->read_done >= 0); | |
388 | } | |
389 | block_mig_state->first_blk = blk; | |
390 | ||
391 | if(block_mig_state->first_blk == NULL) { | |
392 | block_mig_state->last_blk = NULL; | |
393 | } | |
394 | ||
395 | dprintf("%s Exit submitted %d read_done %d transferred%d\n", __FUNCTION__, | |
396 | block_mig_state->submitted, block_mig_state->read_done, | |
397 | block_mig_state->transferred); | |
398 | ||
399 | return; | |
400 | } | |
401 | ||
402 | static int is_stage2_completed(void) | |
403 | { | |
404 | BlkMigDevState *bmds; | |
405 | ||
406 | if(block_mig_state->submitted > 0) { | |
407 | return 0; | |
408 | } | |
409 | ||
410 | for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
411 | if(bmds->bulk_completed == 0) { | |
412 | return 0; | |
413 | } | |
414 | } | |
415 | ||
416 | return 1; | |
417 | } | |
418 | ||
419 | static int block_save_live(QEMUFile *f, int stage, void *opaque) | |
420 | { | |
421 | int ret = 1; | |
422 | ||
423 | dprintf("Enter save live stage %d submitted %d transferred %d\n", stage, | |
424 | submitted, transferred); | |
425 | ||
426 | if(block_mig_state->blk_enable != 1) { | |
427 | /* no need to migrate storage */ | |
428 | ||
429 | qemu_put_be64(f,BLK_MIG_FLAG_EOS); | |
430 | return 1; | |
431 | } | |
432 | ||
433 | if(stage == 1) { | |
434 | init_blk_migration(f); | |
435 | ||
436 | /* start track dirty blocks */ | |
437 | set_dirty_tracking(1); | |
438 | ||
439 | } | |
440 | ||
441 | flush_blks(f); | |
442 | ||
443 | /* control the rate of transfer */ | |
444 | while ((block_mig_state->submitted + block_mig_state->read_done) * | |
445 | (BLOCK_SIZE) < | |
446 | (qemu_file_get_rate_limit(f))) { | |
447 | ||
448 | ret = blk_mig_save_bulked_block(f, 1); | |
449 | ||
450 | if (ret == 0) /* no more bulk blocks for now*/ | |
451 | break; | |
452 | } | |
453 | ||
454 | flush_blks(f); | |
455 | ||
456 | if(stage == 3) { | |
457 | ||
458 | while(blk_mig_save_bulked_block(f, 0) != 0); | |
459 | ||
460 | blk_mig_save_dirty_blocks(f); | |
461 | ||
462 | /* stop track dirty blocks */ | |
463 | set_dirty_tracking(0);; | |
464 | ||
465 | printf("\nBlock migration completed\n"); | |
466 | } | |
467 | ||
468 | qemu_put_be64(f,BLK_MIG_FLAG_EOS); | |
469 | ||
470 | return ((stage == 2) && is_stage2_completed()); | |
471 | } | |
472 | ||
473 | static int block_load(QEMUFile *f, void *opaque, int version_id) | |
474 | { | |
475 | int len, flags; | |
476 | char device_name[256]; | |
477 | int64_t addr; | |
478 | BlockDriverState *bs; | |
479 | uint8_t *buf; | |
480 | ||
481 | block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk(); | |
482 | buf = qemu_malloc(BLOCK_SIZE); | |
483 | ||
484 | do { | |
485 | ||
486 | addr = qemu_get_be64(f); | |
487 | ||
488 | flags = addr & ~SECTOR_MASK; | |
489 | addr &= SECTOR_MASK; | |
490 | ||
491 | if(flags & BLK_MIG_FLAG_DEVICE_BLOCK) { | |
492 | ||
493 | /* get device name */ | |
494 | len = qemu_get_byte(f); | |
495 | ||
496 | qemu_get_buffer(f, (uint8_t *)device_name, len); | |
497 | device_name[len] = '\0'; | |
498 | ||
499 | bs = bdrv_find(device_name); | |
500 | ||
501 | qemu_get_buffer(f, buf, | |
502 | BLOCK_SIZE); | |
503 | if(bs != NULL) { | |
504 | ||
505 | bdrv_write(bs, (addr >> SECTOR_BITS), | |
506 | buf, block_mig_state->sectors_per_block); | |
507 | } else { | |
508 | printf("Error unknown block device %s\n", device_name); | |
509 | } | |
510 | } else if(flags & BLK_MIG_FLAG_EOS) { | |
511 | ||
512 | } else { | |
513 | printf("Unknown flags\n"); | |
514 | } | |
515 | } while(!(flags & BLK_MIG_FLAG_EOS)); | |
516 | ||
517 | qemu_free(buf); | |
518 | ||
519 | return 0; | |
520 | } | |
521 | ||
522 | static void block_set_params(int blk_enable, int shared_base, void *opaque) | |
523 | { | |
524 | assert(opaque == block_mig_state); | |
525 | ||
526 | block_mig_state->blk_enable = blk_enable; | |
527 | block_mig_state->shared_base = shared_base; | |
528 | ||
529 | /* shared base means that blk_enable = 1 */ | |
530 | block_mig_state->blk_enable |= shared_base; | |
531 | ||
532 | return; | |
533 | } | |
534 | ||
535 | void blk_mig_info(void) | |
536 | { | |
537 | BlockDriverState *bs; | |
538 | ||
539 | for (bs = bdrv_first; bs != NULL; bs = bs->next) { | |
540 | printf("Device %s\n", bs->device_name); | |
541 | if(bs->type == BDRV_TYPE_HD) { | |
542 | printf("device %s format %s\n", | |
543 | bs->device_name, bs->drv->format_name); | |
544 | } | |
545 | } | |
546 | } | |
547 | ||
548 | void blk_mig_init(void) | |
549 | { | |
550 | ||
551 | block_mig_state = qemu_mallocz(sizeof(BlkMigState)); | |
552 | ||
553 | register_savevm_live("block", 0, 1, block_set_params, block_save_live, | |
554 | NULL, block_load, block_mig_state); | |
555 | ||
556 | ||
557 | } |