]>
Commit | Line | Data |
---|---|---|
c163b5ca LS |
1 | /* |
2 | * QEMU live block migration | |
3 | * | |
4 | * Copyright IBM, Corp. 2009 | |
5 | * | |
6 | * Authors: | |
7 | * Liran Schour <lirans@il.ibm.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2. See | |
10 | * the COPYING file in the top-level directory. | |
11 | * | |
12 | */ | |
13 | ||
14 | #include "qemu-common.h" | |
15 | #include "block_int.h" | |
16 | #include "hw/hw.h" | |
17 | #include "block-migration.h" | |
18 | #include <assert.h> | |
19 | #include <pthread.h> | |
20 | ||
/* Sector geometry: SECTOR_SIZE is 1 << SECTOR_BITS, i.e. 512 bytes. */
#define SECTOR_BITS 9
#define SECTOR_SIZE (1 << SECTOR_BITS)

/*
 * Mask that clears the low SECTOR_BITS bits of a byte offset.
 *
 * The expansion is fully parenthesised and carries no trailing semicolon,
 * so the macro can be used anywhere inside an expression and not only as
 * the last token of a statement.
 */
#define SECTOR_MASK (~(SECTOR_SIZE - 1))

/*
 * Number of bytes in one migration block.  Expands to a read of
 * block_mig_state->sectors_per_block, so that field must be set first.
 */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)
26 | ||
/*
 * Flag bits stored in the low bits of the 64-bit header that precedes
 * every record in the migration stream.
 */
#define BLK_MIG_FLAG_DEVICE_BLOCK   0x01    /* a device block follows */
#define BLK_MIG_FLAG_EOS            0x02    /* end of this section */

/* Upper bound, in sectors, for a single bdrv_is_allocated() query. */
#define MAX_IS_ALLOCATED_SEARCH     65536

/* Read-ahead tuning values; not referenced in the visible part of this file. */
#define MAX_BLOCKS_READ             10000
#define BLOCKS_READ_CHANGE          100
#define INITIAL_BLOCKS_READ         100
34 | ||
/* Uncomment to get block-migration trace output on stdout. */
/* #define DEBUG_BLK_MIGRATION */

/*
 * dprintf(): printf-style trace helper.  Expands to nothing unless
 * DEBUG_BLK_MIGRATION is defined, in which case each message is prefixed
 * with "blk_migration: ".
 */
#ifndef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { } while (0)
#else
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#endif
44 | ||
45 | typedef struct BlkMigBlock { | |
46 | uint8_t *buf; | |
47 | BlkMigDevState *bmds; | |
48 | int64_t sector; | |
49 | struct iovec iov; | |
50 | QEMUIOVector qiov; | |
51 | BlockDriverAIOCB *aiocb; | |
52 | int ret; | |
53 | struct BlkMigBlock *next; | |
54 | } BlkMigBlock; | |
55 | ||
56 | typedef struct BlkMigState { | |
57 | int bulk_completed; | |
58 | int blk_enable; | |
59 | int shared_base; | |
60 | int no_dirty; | |
61 | QEMUFile *load_file; | |
62 | BlkMigDevState *bmds_first; | |
63 | int sectors_per_block; | |
64 | BlkMigBlock *first_blk; | |
65 | BlkMigBlock *last_blk; | |
66 | int submitted; | |
67 | int read_done; | |
68 | int transferred; | |
69 | int64_t print_completion; | |
70 | } BlkMigState; | |
71 | ||
72 | static BlkMigState *block_mig_state = NULL; | |
73 | ||
74 | static void blk_mig_read_cb(void *opaque, int ret) | |
75 | { | |
76 | BlkMigBlock *blk = opaque; | |
77 | ||
78 | blk->ret = ret; | |
79 | ||
80 | /* insert at the end */ | |
81 | if(block_mig_state->last_blk == NULL) { | |
82 | block_mig_state->first_blk = blk; | |
83 | block_mig_state->last_blk = blk; | |
84 | } else { | |
85 | block_mig_state->last_blk->next = blk; | |
86 | block_mig_state->last_blk = blk; | |
87 | } | |
88 | ||
89 | block_mig_state->submitted--; | |
90 | block_mig_state->read_done++; | |
91 | assert(block_mig_state->submitted >= 0); | |
92 | ||
93 | return; | |
94 | } | |
95 | ||
96 | static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms) | |
97 | { | |
98 | int nr_sectors; | |
99 | int64_t total_sectors, cur_sector = 0; | |
100 | BlockDriverState *bs = bms->bs; | |
101 | BlkMigBlock *blk; | |
102 | ||
103 | blk = qemu_malloc(sizeof(BlkMigBlock)); | |
104 | blk->buf = qemu_malloc(BLOCK_SIZE); | |
105 | ||
106 | cur_sector = bms->cur_sector; | |
107 | total_sectors = bdrv_getlength(bs) >> SECTOR_BITS; | |
108 | ||
109 | if(bms->shared_base) { | |
110 | while(cur_sector < bms->total_sectors && | |
111 | !bdrv_is_allocated(bms->bs, cur_sector, | |
112 | MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { | |
113 | cur_sector += nr_sectors; | |
114 | } | |
115 | } | |
116 | ||
117 | if(cur_sector >= total_sectors) { | |
118 | bms->cur_sector = total_sectors; | |
119 | qemu_free(blk->buf); | |
120 | qemu_free(blk); | |
121 | return 1; | |
122 | } | |
123 | ||
124 | if(cur_sector >= block_mig_state->print_completion) { | |
125 | printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors); | |
126 | fflush(stdout); | |
127 | block_mig_state->print_completion += | |
128 | (block_mig_state->sectors_per_block * 10000); | |
129 | } | |
130 | ||
131 | /* we going to transfder BLOCK_SIZE any way even if it is not allocated */ | |
132 | nr_sectors = block_mig_state->sectors_per_block; | |
133 | ||
134 | cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1); | |
135 | ||
136 | if(total_sectors - cur_sector < block_mig_state->sectors_per_block) { | |
137 | nr_sectors = (total_sectors - cur_sector); | |
138 | } | |
139 | ||
140 | bms->cur_sector = cur_sector + nr_sectors; | |
141 | blk->sector = cur_sector; | |
142 | blk->bmds = bms; | |
143 | blk->next = NULL; | |
144 | ||
145 | blk->iov.iov_base = blk->buf; | |
146 | blk->iov.iov_len = nr_sectors * SECTOR_SIZE; | |
147 | qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); | |
148 | ||
149 | blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, | |
150 | nr_sectors, blk_mig_read_cb, blk); | |
151 | ||
152 | if(!blk->aiocb) { | |
153 | printf("Error reading sector %" PRId64 "\n", cur_sector); | |
154 | qemu_free(blk->buf); | |
155 | qemu_free(blk); | |
156 | return 0; | |
157 | } | |
158 | ||
159 | bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors); | |
160 | block_mig_state->submitted++; | |
161 | ||
162 | return (bms->cur_sector >= total_sectors); | |
163 | } | |
164 | ||
165 | static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) | |
166 | { | |
167 | int len, nr_sectors; | |
168 | int64_t total_sectors = bmds->total_sectors, cur_sector = 0; | |
169 | uint8_t *tmp_buf = NULL; | |
170 | BlockDriverState *bs = bmds->bs; | |
171 | ||
172 | tmp_buf = qemu_malloc(BLOCK_SIZE); | |
173 | ||
174 | cur_sector = bmds->cur_sector; | |
175 | ||
176 | if(bmds->shared_base) { | |
177 | while(cur_sector < bmds->total_sectors && | |
178 | !bdrv_is_allocated(bmds->bs, cur_sector, | |
179 | MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { | |
180 | cur_sector += nr_sectors; | |
181 | } | |
182 | } | |
183 | ||
184 | if(cur_sector >= total_sectors) { | |
185 | bmds->cur_sector = total_sectors; | |
186 | qemu_free(tmp_buf); | |
187 | return 1; | |
188 | } | |
189 | ||
190 | if(cur_sector >= block_mig_state->print_completion) { | |
191 | printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors); | |
192 | fflush(stdout); | |
193 | block_mig_state->print_completion += | |
194 | (block_mig_state->sectors_per_block * 10000); | |
195 | } | |
196 | ||
197 | cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1); | |
198 | ||
199 | /* we going to transfer | |
200 | BLOCK_SIZE | |
201 | any way even if it is not allocated */ | |
202 | nr_sectors = block_mig_state->sectors_per_block; | |
203 | ||
204 | if(total_sectors - cur_sector < block_mig_state->sectors_per_block) { | |
205 | nr_sectors = (total_sectors - cur_sector); | |
206 | } | |
207 | ||
208 | if(bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) { | |
209 | printf("Error reading sector %" PRId64 "\n", cur_sector); | |
210 | } | |
211 | ||
212 | bdrv_reset_dirty(bs, cur_sector, nr_sectors); | |
213 | ||
214 | /* Device name */ | |
215 | qemu_put_be64(f,(cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK); | |
216 | ||
217 | len = strlen(bs->device_name); | |
218 | qemu_put_byte(f, len); | |
219 | qemu_put_buffer(f, (uint8_t *)bs->device_name, len); | |
220 | ||
221 | qemu_put_buffer(f, tmp_buf, | |
222 | BLOCK_SIZE); | |
223 | ||
224 | bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block; | |
225 | ||
226 | qemu_free(tmp_buf); | |
227 | ||
228 | return (bmds->cur_sector >= total_sectors); | |
229 | } | |
230 | ||
231 | static void send_blk(QEMUFile *f, BlkMigBlock * blk) | |
232 | { | |
233 | int len; | |
234 | ||
235 | /* Device name */ | |
236 | qemu_put_be64(f,(blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK); | |
237 | ||
238 | len = strlen(blk->bmds->bs->device_name); | |
239 | qemu_put_byte(f, len); | |
240 | qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len); | |
241 | ||
242 | qemu_put_buffer(f, blk->buf, | |
243 | BLOCK_SIZE); | |
244 | ||
245 | return; | |
246 | } | |
247 | ||
248 | static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds) | |
249 | { | |
250 | } | |
251 | ||
252 | static void set_dirty_tracking(int enable) | |
253 | { | |
254 | BlkMigDevState *bmds; | |
255 | for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
256 | bdrv_set_dirty_tracking(bmds->bs,enable); | |
257 | } | |
258 | ||
259 | return; | |
260 | } | |
261 | ||
262 | static void init_blk_migration(QEMUFile *f) | |
263 | { | |
264 | BlkMigDevState **pbmds, *bmds; | |
265 | BlockDriverState *bs; | |
266 | ||
267 | for (bs = bdrv_first; bs != NULL; bs = bs->next) { | |
268 | if(bs->type == BDRV_TYPE_HD) { | |
269 | bmds = qemu_mallocz(sizeof(BlkMigDevState)); | |
270 | bmds->bs = bs; | |
271 | bmds->bulk_completed = 0; | |
272 | bmds->total_sectors = bdrv_getlength(bs) >> SECTOR_BITS; | |
273 | bmds->shared_base = block_mig_state->shared_base; | |
274 | ||
275 | if(bmds->shared_base) { | |
276 | printf("Start migration for %s with shared base image\n", | |
277 | bs->device_name); | |
278 | } else { | |
279 | printf("Start full migration for %s\n", bs->device_name); | |
280 | } | |
281 | ||
282 | /* insert at the end */ | |
283 | pbmds = &block_mig_state->bmds_first; | |
284 | while (*pbmds != NULL) | |
285 | pbmds = &(*pbmds)->next; | |
286 | *pbmds = bmds; | |
287 | ||
288 | blk_mig_save_dev_info(f, bmds); | |
289 | ||
290 | } | |
291 | } | |
292 | ||
293 | block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk(); | |
294 | ||
295 | return; | |
296 | } | |
297 | ||
298 | static int blk_mig_save_bulked_block(QEMUFile *f, int is_async) | |
299 | { | |
300 | BlkMigDevState *bmds; | |
301 | ||
302 | for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
303 | if(bmds->bulk_completed == 0) { | |
304 | if(is_async) { | |
305 | if(mig_read_device_bulk(f, bmds) == 1) { | |
306 | /* completed bulk section for this device */ | |
307 | bmds->bulk_completed = 1; | |
308 | } | |
309 | } else { | |
310 | if(mig_save_device_bulk(f,bmds) == 1) { | |
311 | /* completed bulk section for this device */ | |
312 | bmds->bulk_completed = 1; | |
313 | } | |
314 | } | |
315 | return 1; | |
316 | } | |
317 | } | |
318 | ||
319 | /* we reached here means bulk is completed */ | |
320 | block_mig_state->bulk_completed = 1; | |
321 | ||
322 | return 0; | |
323 | ||
324 | } | |
325 | ||
326 | #define MAX_NUM_BLOCKS 4 | |
327 | ||
328 | static void blk_mig_save_dirty_blocks(QEMUFile *f) | |
329 | { | |
330 | BlkMigDevState *bmds; | |
331 | uint8_t buf[BLOCK_SIZE]; | |
332 | int64_t sector; | |
333 | int len; | |
334 | ||
335 | for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
336 | for(sector = 0; sector < bmds->cur_sector;) { | |
337 | ||
338 | if(bdrv_get_dirty(bmds->bs,sector)) { | |
339 | ||
340 | if(bdrv_read(bmds->bs, sector, buf, | |
341 | block_mig_state->sectors_per_block) < 0) { | |
342 | } | |
343 | ||
344 | /* device name */ | |
345 | qemu_put_be64(f,(sector << SECTOR_BITS) | |
346 | | BLK_MIG_FLAG_DEVICE_BLOCK); | |
347 | ||
348 | len = strlen(bmds->bs->device_name); | |
349 | ||
350 | qemu_put_byte(f, len); | |
351 | qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len); | |
352 | ||
353 | qemu_put_buffer(f, buf, | |
354 | (block_mig_state->sectors_per_block * | |
355 | SECTOR_SIZE)); | |
356 | ||
357 | bdrv_reset_dirty(bmds->bs, sector, | |
358 | block_mig_state->sectors_per_block); | |
359 | ||
360 | sector += block_mig_state->sectors_per_block; | |
361 | } else { | |
362 | /* sector is clean */ | |
363 | sector += block_mig_state->sectors_per_block; | |
364 | } | |
365 | } | |
366 | } | |
367 | ||
368 | return; | |
369 | } | |
370 | ||
371 | static void flush_blks(QEMUFile* f) | |
372 | { | |
373 | BlkMigBlock *blk, *tmp; | |
374 | ||
375 | dprintf("%s Enter submitted %d read_done %d transfered\n", __FUNCTION__, | |
376 | submitted, read_done, transfered); | |
377 | ||
378 | for(blk = block_mig_state->first_blk; | |
379 | blk != NULL && !qemu_file_rate_limit(f); blk = tmp) { | |
380 | send_blk(f, blk); | |
381 | ||
382 | tmp = blk->next; | |
383 | qemu_free(blk->buf); | |
384 | qemu_free(blk); | |
385 | ||
386 | block_mig_state->read_done--; | |
387 | block_mig_state->transferred++; | |
388 | assert(block_mig_state->read_done >= 0); | |
389 | } | |
390 | block_mig_state->first_blk = blk; | |
391 | ||
392 | if(block_mig_state->first_blk == NULL) { | |
393 | block_mig_state->last_blk = NULL; | |
394 | } | |
395 | ||
396 | dprintf("%s Exit submitted %d read_done %d transferred%d\n", __FUNCTION__, | |
397 | block_mig_state->submitted, block_mig_state->read_done, | |
398 | block_mig_state->transferred); | |
399 | ||
400 | return; | |
401 | } | |
402 | ||
403 | static int is_stage2_completed(void) | |
404 | { | |
405 | BlkMigDevState *bmds; | |
406 | ||
407 | if(block_mig_state->submitted > 0) { | |
408 | return 0; | |
409 | } | |
410 | ||
411 | for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { | |
412 | if(bmds->bulk_completed == 0) { | |
413 | return 0; | |
414 | } | |
415 | } | |
416 | ||
417 | return 1; | |
418 | } | |
419 | ||
420 | static int block_save_live(QEMUFile *f, int stage, void *opaque) | |
421 | { | |
422 | int ret = 1; | |
423 | ||
424 | dprintf("Enter save live stage %d submitted %d transferred %d\n", stage, | |
425 | submitted, transferred); | |
426 | ||
427 | if(block_mig_state->blk_enable != 1) { | |
428 | /* no need to migrate storage */ | |
429 | ||
430 | qemu_put_be64(f,BLK_MIG_FLAG_EOS); | |
431 | return 1; | |
432 | } | |
433 | ||
434 | if(stage == 1) { | |
435 | init_blk_migration(f); | |
436 | ||
437 | /* start track dirty blocks */ | |
438 | set_dirty_tracking(1); | |
439 | ||
440 | } | |
441 | ||
442 | flush_blks(f); | |
443 | ||
444 | /* control the rate of transfer */ | |
445 | while ((block_mig_state->submitted + block_mig_state->read_done) * | |
446 | (BLOCK_SIZE) < | |
447 | (qemu_file_get_rate_limit(f))) { | |
448 | ||
449 | ret = blk_mig_save_bulked_block(f, 1); | |
450 | ||
451 | if (ret == 0) /* no more bulk blocks for now*/ | |
452 | break; | |
453 | } | |
454 | ||
455 | flush_blks(f); | |
456 | ||
457 | if(stage == 3) { | |
458 | ||
459 | while(blk_mig_save_bulked_block(f, 0) != 0); | |
460 | ||
461 | blk_mig_save_dirty_blocks(f); | |
462 | ||
463 | /* stop track dirty blocks */ | |
464 | set_dirty_tracking(0);; | |
465 | ||
466 | printf("\nBlock migration completed\n"); | |
467 | } | |
468 | ||
469 | qemu_put_be64(f,BLK_MIG_FLAG_EOS); | |
470 | ||
471 | return ((stage == 2) && is_stage2_completed()); | |
472 | } | |
473 | ||
474 | static int block_load(QEMUFile *f, void *opaque, int version_id) | |
475 | { | |
476 | int len, flags; | |
477 | char device_name[256]; | |
478 | int64_t addr; | |
479 | BlockDriverState *bs; | |
480 | uint8_t *buf; | |
481 | ||
482 | block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk(); | |
483 | buf = qemu_malloc(BLOCK_SIZE); | |
484 | ||
485 | do { | |
486 | ||
487 | addr = qemu_get_be64(f); | |
488 | ||
489 | flags = addr & ~SECTOR_MASK; | |
490 | addr &= SECTOR_MASK; | |
491 | ||
492 | if(flags & BLK_MIG_FLAG_DEVICE_BLOCK) { | |
493 | ||
494 | /* get device name */ | |
495 | len = qemu_get_byte(f); | |
496 | ||
497 | qemu_get_buffer(f, (uint8_t *)device_name, len); | |
498 | device_name[len] = '\0'; | |
499 | ||
500 | bs = bdrv_find(device_name); | |
501 | ||
502 | qemu_get_buffer(f, buf, | |
503 | BLOCK_SIZE); | |
504 | if(bs != NULL) { | |
505 | ||
506 | bdrv_write(bs, (addr >> SECTOR_BITS), | |
507 | buf, block_mig_state->sectors_per_block); | |
508 | } else { | |
509 | printf("Error unknown block device %s\n", device_name); | |
510 | } | |
511 | } else if(flags & BLK_MIG_FLAG_EOS) { | |
512 | ||
513 | } else { | |
514 | printf("Unknown flags\n"); | |
515 | } | |
516 | } while(!(flags & BLK_MIG_FLAG_EOS)); | |
517 | ||
518 | qemu_free(buf); | |
519 | ||
520 | return 0; | |
521 | } | |
522 | ||
523 | static void block_set_params(int blk_enable, int shared_base, void *opaque) | |
524 | { | |
525 | assert(opaque == block_mig_state); | |
526 | ||
527 | block_mig_state->blk_enable = blk_enable; | |
528 | block_mig_state->shared_base = shared_base; | |
529 | ||
530 | /* shared base means that blk_enable = 1 */ | |
531 | block_mig_state->blk_enable |= shared_base; | |
532 | ||
533 | return; | |
534 | } | |
535 | ||
536 | void blk_mig_info(void) | |
537 | { | |
538 | BlockDriverState *bs; | |
539 | ||
540 | for (bs = bdrv_first; bs != NULL; bs = bs->next) { | |
541 | printf("Device %s\n", bs->device_name); | |
542 | if(bs->type == BDRV_TYPE_HD) { | |
543 | printf("device %s format %s\n", | |
544 | bs->device_name, bs->drv->format_name); | |
545 | } | |
546 | } | |
547 | } | |
548 | ||
549 | void blk_mig_init(void) | |
550 | { | |
551 | ||
552 | block_mig_state = qemu_mallocz(sizeof(BlkMigState)); | |
553 | ||
554 | register_savevm_live("block", 0, 1, block_set_params, block_save_live, | |
555 | NULL, block_load, block_mig_state); | |
556 | ||
557 | ||
558 | } |