]> git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
migration: migrate QTAILQ
[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47
/* Debug tracing helper: expands to a no-op unless DEBUG_MIGRATION_RAM
 * is defined at compile time.
 */
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
55
/* Consecutive sync rounds in which pages were dirtied faster than they
 * were transferred; drives auto-converge throttling in
 * migration_bitmap_sync().
 */
static int dirty_rate_high_cnt;

/* How many times the dirty bitmap has been synced; also serves as the
 * generation number for XBZRLE cache entries.
 */
static uint64_t bitmap_sync_count;

/***********************************************************/
/* ram save/restore */

/* Flags carried in the low bits of the page offset on the wire */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

/* A page of zeroes, used as the reference content when a zero page is
 * inserted into the XBZRLE cache (see xbzrle_cache_zero_page()).
 */
static uint8_t *ZERO_TARGET_PAGE;
74
/* Return true iff the first @size bytes at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}
79
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Serialises cache access between the migration thread and
     * qmp_migrate_set_cache_size(); take via XBZRLE_cache_lock().
     */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
94
95 static void XBZRLE_cache_lock(void)
96 {
97 if (migrate_use_xbzrle())
98 qemu_mutex_lock(&XBZRLE.lock);
99 }
100
101 static void XBZRLE_cache_unlock(void)
102 {
103 if (migrate_use_xbzrle())
104 qemu_mutex_unlock(&XBZRLE.lock);
105 }
106
107 /*
108 * called from qmp_migrate_set_cache_size in main thread, possibly while
109 * a migration is in progress.
110 * A running migration maybe using the cache and might finish during this
111 * call, hence changes to the cache are protected by XBZRLE.lock().
112 */
113 int64_t xbzrle_cache_resize(int64_t new_size)
114 {
115 PageCache *new_cache;
116 int64_t ret;
117
118 if (new_size < TARGET_PAGE_SIZE) {
119 return -1;
120 }
121
122 XBZRLE_cache_lock();
123
124 if (XBZRLE.cache != NULL) {
125 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
126 goto out_new_size;
127 }
128 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
129 TARGET_PAGE_SIZE);
130 if (!new_cache) {
131 error_report("Error creating cache");
132 ret = -1;
133 goto out;
134 }
135
136 cache_fini(XBZRLE.cache);
137 XBZRLE.cache = new_cache;
138 }
139
140 out_new_size:
141 ret = pow2floor(new_size);
142 out:
143 XBZRLE_cache_unlock();
144 return ret;
145 }
146
/* accounting for migration statistics */
typedef struct AccountingInfo {
    /* pages found to be all-zero ("duplicate") */
    uint64_t dup_pages;
    /* pages accounted as skipped (reported via the accessors below) */
    uint64_t skipped_pages;
    /* full-content ("normal") pages sent */
    uint64_t norm_pages;
    /* iterations of the dirty-page scan loop */
    uint64_t iterations;
    /* bytes of XBZRLE-encoded data sent (including headers) */
    uint64_t xbzrle_bytes;
    /* pages sent XBZRLE-encoded */
    uint64_t xbzrle_pages;
    /* pages not found in the XBZRLE cache */
    uint64_t xbzrle_cache_miss;
    /* cache misses per scan iteration, updated in migration_bitmap_sync */
    double xbzrle_cache_miss_rate;
    /* pages whose XBZRLE encoding was larger than the raw page */
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;

/* Reset all migration statistics to zero. */
static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}
166
/* Bytes accounted to zero ("duplicate") pages */
uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

/* Number of zero ("duplicate") pages sent */
uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

/* Bytes accounted to skipped pages */
uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

/* Number of skipped pages */
uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

/* Bytes accounted to full ("normal") pages */
uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

/* Number of full ("normal") pages sent */
uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

/* Bytes of XBZRLE-encoded data sent so far */
uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

/* Number of pages sent XBZRLE-encoded */
uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

/* Number of XBZRLE cache misses so far */
uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

/* XBZRLE cache misses per scan iteration (see migration_bitmap_sync) */
double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

/* Number of pages whose XBZRLE encoding exceeded the page size */
uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}
221
/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
/* Offset within last_seen_block at which the previous dirty-page search
 * stopped (maintained by the scan loop elsewhere in this file).
 */
static ram_addr_t last_offset;
/* Serialises replacement of migration_bitmap_rcu against bitmap syncs */
static QemuMutex migration_bitmap_mutex;
/* Pages currently marked dirty, i.e. still to be sent */
static uint64_t migration_dirty_pages;
/* ram_list version last observed - NOTE(review): presumably used to
 * detect RAMBlock list changes; the consumer is outside this chunk.
 */
static uint32_t last_version;
/* True during the first full pass over RAM, when every page is treated
 * as dirty and the bitmap need not be scanned (see
 * migration_bitmap_find_dirty).
 */
static bool ram_bulk_stage;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current offset to search from */
    ram_addr_t offset;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

/* Dirty/unsent bitmaps, replaced as a unit under RCU; readers access it
 * via atomic_rcu_read(&migration_bitmap_rcu).
 */
static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
255
/* State shared between the migration thread and one compression worker */
struct CompressParam {
    /* True when the worker is idle and 'file' holds finished output;
     * protected by comp_done_lock.
     */
    bool done;
    /* Set (under 'mutex') to ask the worker thread to exit */
    bool quit;
    /* Staging QEMUFile the worker compresses into; drained by the
     * migration thread with qemu_put_qemu_file().
     */
    QEMUFile *file;
    /* Protects the block/offset/quit hand-off with the worker */
    QemuMutex mutex;
    /* Signalled to wake the worker when work (or quit) is queued */
    QemuCond cond;
    /* Page to compress next; block == NULL means "no work queued" */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

/* State shared with one decompression worker on the load side */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* Destination for the decompressed page - NOTE(review): the consumer
     * is outside this chunk; verify against the load path.
     */
    void *des;
    /* Buffer holding the compressed input */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* When false, data compression is disabled mid-migration (cleared in
 * find_dirty_block once xbzrle takes over).
 */
static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
297
/* Compression worker thread body.
 *
 * Waits on param->cond until the migration thread queues a page by
 * setting param->block/offset, compresses that page into param->file,
 * then marks itself done and signals comp_done_cond so the migration
 * thread can collect the output.  Exits when param->quit is set.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            /* Clearing block marks this slot free for the next request */
            param->block = NULL;
            /* Drop the lock while compressing so the producer is not
             * blocked behind zlib.
             */
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            /* 'done' is protected by comp_done_lock, not param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
328
329 static inline void terminate_compression_threads(void)
330 {
331 int idx, thread_count;
332
333 thread_count = migrate_compress_threads();
334 for (idx = 0; idx < thread_count; idx++) {
335 qemu_mutex_lock(&comp_param[idx].mutex);
336 comp_param[idx].quit = true;
337 qemu_cond_signal(&comp_param[idx].cond);
338 qemu_mutex_unlock(&comp_param[idx].mutex);
339 }
340 }
341
342 void migrate_compress_threads_join(void)
343 {
344 int i, thread_count;
345
346 if (!migrate_use_compression()) {
347 return;
348 }
349 terminate_compression_threads();
350 thread_count = migrate_compress_threads();
351 for (i = 0; i < thread_count; i++) {
352 qemu_thread_join(compress_threads + i);
353 qemu_fclose(comp_param[i].file);
354 qemu_mutex_destroy(&comp_param[i].mutex);
355 qemu_cond_destroy(&comp_param[i].cond);
356 }
357 qemu_mutex_destroy(&comp_done_lock);
358 qemu_cond_destroy(&comp_done_cond);
359 g_free(compress_threads);
360 g_free(comp_param);
361 compress_threads = NULL;
362 comp_param = NULL;
363 }
364
365 void migrate_compress_threads_create(void)
366 {
367 int i, thread_count;
368
369 if (!migrate_use_compression()) {
370 return;
371 }
372 compression_switch = true;
373 thread_count = migrate_compress_threads();
374 compress_threads = g_new0(QemuThread, thread_count);
375 comp_param = g_new0(CompressParam, thread_count);
376 qemu_cond_init(&comp_done_cond);
377 qemu_mutex_init(&comp_done_lock);
378 for (i = 0; i < thread_count; i++) {
379 /* comp_param[i].file is just used as a dummy buffer to save data,
380 * set its ops to empty.
381 */
382 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
383 comp_param[i].done = true;
384 comp_param[i].quit = false;
385 qemu_mutex_init(&comp_param[i].mutex);
386 qemu_cond_init(&comp_param[i].cond);
387 qemu_thread_create(compress_threads + i, "compress",
388 do_data_compress, comp_param + i,
389 QEMU_THREAD_JOINABLE);
390 }
391 }
392
393 /**
394 * save_page_header: Write page header to wire
395 *
396 * If this is the 1st block, it also writes the block identification
397 *
398 * Returns: Number of bytes written
399 *
400 * @f: QEMUFile where to send the data
401 * @block: block that contains the page we want to send
402 * @offset: offset inside the block for the page
403 * in the lower bits, it contains flags
404 */
405 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
406 {
407 size_t size, len;
408
409 qemu_put_be64(f, offset);
410 size = 8;
411
412 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
413 len = strlen(block->idstr);
414 qemu_put_byte(f, len);
415 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
416 size += 1 + len;
417 }
418 return size;
419 }
420
421 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
422 * If guest dirty memory rate is reduced below the rate at which we can
423 * transfer pages to the destination then we should be able to complete
424 * migration. Some workloads dirty memory way too fast and will not effectively
425 * converge, even with auto-converge.
426 */
427 static void mig_throttle_guest_down(void)
428 {
429 MigrationState *s = migrate_get_current();
430 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
431 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
432
433 /* We have not started throttling yet. Let's start it. */
434 if (!cpu_throttle_active()) {
435 cpu_throttle_set(pct_initial);
436 } else {
437 /* Throttling already on, just increase the rate */
438 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
439 }
440 }
441
442 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
443 * The important thing is that a stale (not-yet-0'd) page be replaced
444 * by the new data.
445 * As a bonus, if the page wasn't in the cache it gets added so that
446 * when a small write is made into the 0'd page it gets XBZRLE sent
447 */
448 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
449 {
450 if (ram_bulk_stage || !migrate_use_xbzrle()) {
451 return;
452 }
453
454 /* We don't care if this fails to allocate a new cache page
455 * as long as it updated an old one */
456 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
457 bitmap_sync_count);
458 }
459
/* Sub-type byte following an XBZRLE page header on the wire */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Called with the XBZRLE cache lock held (see ram_save_page).
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @current_data: in/out pointer to the page contents; may be redirected
 *                to the cached copy so the caller sends stable data
 * @current_addr: address of the page in the global ram_addr space,
 *                used as the cache key
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        /* No reference copy to diff against: send as a normal page and
         * (except in the final stage) seed the cache for next time.
         */
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        /* Page unchanged since the cached copy: nothing to send */
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        /* Encoded form would be larger than the raw page */
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    /* +1 for the encoding flag byte, +2 for the be16 length */
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
540
/* Called with rcu_read_lock() to protect migration_bitmap
 * rb: The RAMBlock to search for dirty pages in
 * start: Start address (typically so we can continue from previous page)
 * ram_addr_abs: Pointer into which to store the address of the dirty page
 *               within the global ram_addr space
 *
 * Returns: byte offset within memory region of the start of a dirty page
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    /* Bit index of the block's first page in the global dirty bitmap */
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    /* Bit index corresponding to @start within @rb */
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    /* One past the block's last bit: bounds the bitmap search */
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    if (ram_bulk_stage && nr > base) {
        /* During the bulk stage every page is dirty, so the next dirty
         * page is simply the next page - no bitmap scan needed.
         */
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}
572
573 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
574 {
575 bool ret;
576 int nr = addr >> TARGET_PAGE_BITS;
577 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
578
579 ret = test_and_clear_bit(nr, bitmap);
580
581 if (ret) {
582 migration_dirty_pages--;
583 }
584 return ret;
585 }
586
587 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
588 {
589 unsigned long *bitmap;
590 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
591 migration_dirty_pages +=
592 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
593 }
594
/* Fix me: there are too many global variables used in migration process. */
/* Timestamp (ms) of the start of the current dirty-rate period */
static int64_t start_time;
/* Bytes transferred as of the previous bitmap sync */
static int64_t bytes_xfer_prev;
/* Pages dirtied during the current measurement period */
static int64_t num_dirty_pages_period;
/* xbzrle cache misses as of the previous rate calculation */
static uint64_t xbzrle_cache_miss_prev;
/* acct_info.iterations as of the previous rate calculation */
static uint64_t iterations_prev;

/* Reset the dirty-rate measurement state for a fresh migration. */
static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}
610
/* Synchronise the migration dirty bitmap with the memory core's dirty
 * log across all RAMBlocks, update the dirty-page-rate statistics, and
 * drive auto-converge throttling and the XBZRLE cache-miss rate.
 */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    /* Lazily initialise the measurement baselines on the first sync */
    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* migration_bitmap_mutex orders this against concurrent bitmap
     * replacement; the RCU read lock protects the RAMBlock list walk.
     */
    qemu_mutex_lock(&migration_bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&migration_bitmap_mutex);

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */
            bytes_xfer_now = ram_bytes_transferred();

            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                 (bytes_xfer_now - bytes_xfer_prev)/2) &&
                (dirty_rate_high_cnt++ >= 2)) {
                trace_migration_throttle();
                dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
            bytes_xfer_prev = bytes_xfer_now;
        }

        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                /* Misses per scan iteration since the last update */
                acct_info.xbzrle_cache_miss_rate =
                    (double)(acct_info.xbzrle_cache_miss -
                             xbzrle_cache_miss_prev) /
                    (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        /* Start a new measurement period */
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
    /* Emit a MIGRATION_PASS QMP event per sync when events are enabled */
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
    }
}
687
688 /**
689 * save_zero_page: Send the zero page to the stream
690 *
691 * Returns: Number of pages written.
692 *
693 * @f: QEMUFile where to send the data
694 * @block: block that contains the page we want to send
695 * @offset: offset inside the block for the page
696 * @p: pointer to the page
697 * @bytes_transferred: increase it with the number of transferred bytes
698 */
699 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
700 uint8_t *p, uint64_t *bytes_transferred)
701 {
702 int pages = -1;
703
704 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
705 acct_info.dup_pages++;
706 *bytes_transferred += save_page_header(f, block,
707 offset | RAM_SAVE_FLAG_COMPRESS);
708 qemu_put_byte(f, 0);
709 *bytes_transferred += 1;
710 pages = 1;
711 }
712
713 return pages;
714 }
715
/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @f: QEMUFile where to send the data
 * @pss: PageSearchStatus holding the block/offset of the page to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* In doubt sent page as normal */
    bytes_xmit = 0;
    /* Give the transport (e.g. RDMA) a chance to take over the send */
    ret = ram_control_save_page(f, block->offset,
                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        /* Same block as the previous page: header can omit the block id */
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The transport handled (or will handle) the page itself;
         * only accounting is left to do here.
         */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage &&
                   !migration_in_postcopy(migrate_get_current()) &&
                   migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
807
/* Compress one page and write it (header + zlib data) to @f.
 *
 * Runs on a compression worker thread; @f is that worker's private
 * staging file, so no locking is needed here.
 *
 * Returns the number of bytes written, or 0 on compression failure.
 * Note that on failure the page header has already been written into
 * @f; the error is latched on the real migration stream via
 * qemu_file_set_error().
 */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
    }

    return bytes_sent;
}
828
/* Running byte total for this migration; updated here when compressed
 * worker output is flushed (other updaters are outside this chunk).
 */
static uint64_t bytes_transferred;

/* Wait until every compression worker is idle, then move each worker's
 * buffered output into the migration stream @f.  Needed whenever all
 * previously queued compressed pages must hit the wire (e.g. before
 * switching to a new RAMBlock).
 */
static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: under comp_done_lock, wait for all workers to finish */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: drain each worker's staging file under its own mutex */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
857
/* Hand the page (@block, @offset) to a compression worker.  The
 * worker's mutex must be held by the caller (see
 * compress_page_with_multi_thread); a non-NULL block is what wakes the
 * worker loop in do_data_compress.
 */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
864
/* Queue one page for compression on an idle worker thread, blocking
 * until a worker becomes free.  The chosen worker's previous output is
 * drained into @f before the worker is reused.
 *
 * Returns: Number of pages written (always 1 once a worker is found).
 */
static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                /* Reading the staging file without the param mutex is
                 * OK here: the worker stays idle until signalled below.
                 */
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            /* All workers busy: wait for one to signal completion */
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
898
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @pss: PageSearchStatus holding the block/offset of the page to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
                                    bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* Give the transport (e.g. RDMA) a chance to take over the send */
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                /* Compressed synchronously here, not via the worker
                 * threads, to preserve the ordering described above.
                 */
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    *bytes_transferred += bytes_xmit + blen;
                    acct_info.norm_pages++;
                    pages = 1;
                } else {
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
        } else {
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
974
/*
 * Find the next dirty page and update any state associated with
 * the search process.
 *
 * NOTE(review): uses the RCU list accessors on ram_list, so the caller
 * presumably holds rcu_read_lock() - confirm against the scan loop.
 *
 * Returns: True if a page is found
 *
 * @f: Current migration stream.
 * @pss: Data about the state of the current dirty page scan.
 * @*again: Set to false if the search has scanned the whole of RAM
 * *ram_addr_abs: Pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
                             bool *again, ram_addr_t *ram_addr_abs)
{
    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
                                              ram_addr_abs);
    if (pss->complete_round && pss->block == last_seen_block &&
        pss->offset >= last_offset) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (pss->offset >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->offset = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After one full pass the "everything is dirty" shortcut of
             * the bulk stage no longer applies.
             */
            ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(f);
                compression_switch = false;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
1029
/*
 * Helper for 'get_queued_page' - gets a page off the queue
 * ms: MigrationState in
 * *offset: Used to return the offset within the RAMBlock
 * ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns: block (or NULL if none available)
 */
static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
                              ram_addr_t *ram_addr_abs)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&ms->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
        struct MigrationSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&ms->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
        *ram_addr_abs = (entry->offset + entry->rb->offset) &
                        TARGET_PAGE_MASK;

        if (entry->len > TARGET_PAGE_SIZE) {
            /* Request spans several pages: consume only the first page
             * and leave the shortened request queued.
             */
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            /* Fully consumed: drop the request and the block reference
             * that was taken when it was queued.
             */
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&ms->src_page_req_mutex);

    return block;
}
1065
/*
 * Unqueue a page from the queue fed by postcopy page requests; skips pages
 * that are already sent (!dirty)
 *
 * ms: MigrationState in
 * pss: PageSearchStatus structure updated with found block/offset
 * ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns: true if a queued page is found
 */
static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
                            ram_addr_t *ram_addr_abs)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(ms, &offset, ram_addr_abs);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long *bitmap;
            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(
                    block->idstr, (uint64_t)offset,
                    (uint64_t)*ram_addr_abs,
                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
            } else {
                trace_get_queued_page(block->idstr,
                                      (uint64_t)offset,
                                      (uint64_t)*ram_addr_abs);
            }
        }

    /* 'dirty' is only evaluated when 'block' is non-NULL, in which case
     * it was assigned above - the short-circuit never reads it
     * uninitialised.
     */
    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->offset = offset;
    }

    return !!block;
}
1130
/**
 * flush_page_queue: Flush any remaining pages in the ram request queue
 * it should be empty at the end anyway, but in error cases there may be
 * some left.
 *
 * ms: MigrationState
 */
void flush_page_queue(MigrationState *ms)
{
    struct MigrationSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    /*
     * NOTE(review): the queue is walked without taking src_page_req_mutex;
     * presumably callers only invoke this once no producer can still queue
     * new requests - confirm against the call sites.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
        /* Drop the reference taken when the request was queued */
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}
1152
1153 /**
1154 * Queue the pages for transmission, e.g. a request from postcopy destination
1155 * ms: MigrationStatus in which the queue is held
1156 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1157 * start: Offset from the start of the RAMBlock
1158 * len: Length (in bytes) to send
1159 * Return: 0 on success
1160 */
1161 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1162 ram_addr_t start, ram_addr_t len)
1163 {
1164 RAMBlock *ramblock;
1165
1166 ms->postcopy_requests++;
1167 rcu_read_lock();
1168 if (!rbname) {
1169 /* Reuse last RAMBlock */
1170 ramblock = ms->last_req_rb;
1171
1172 if (!ramblock) {
1173 /*
1174 * Shouldn't happen, we can't reuse the last RAMBlock if
1175 * it's the 1st request.
1176 */
1177 error_report("ram_save_queue_pages no previous block");
1178 goto err;
1179 }
1180 } else {
1181 ramblock = qemu_ram_block_by_name(rbname);
1182
1183 if (!ramblock) {
1184 /* We shouldn't be asked for a non-existent RAMBlock */
1185 error_report("ram_save_queue_pages no block '%s'", rbname);
1186 goto err;
1187 }
1188 ms->last_req_rb = ramblock;
1189 }
1190 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1191 if (start+len > ramblock->used_length) {
1192 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1193 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1194 __func__, start, len, ramblock->used_length);
1195 goto err;
1196 }
1197
1198 struct MigrationSrcPageRequest *new_entry =
1199 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1200 new_entry->rb = ramblock;
1201 new_entry->offset = start;
1202 new_entry->len = len;
1203
1204 memory_region_ref(ramblock->mr);
1205 qemu_mutex_lock(&ms->src_page_req_mutex);
1206 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1207 qemu_mutex_unlock(&ms->src_page_req_mutex);
1208 rcu_read_unlock();
1209
1210 return 0;
1211
1212 err:
1213 rcu_read_unlock();
1214 return -1;
1215 }
1216
/**
 * ram_save_target_page: Save one target page
 *
 * Sends the page at dirty_ram_abs if (and only if) its dirty bit is still
 * set, clearing the bit in the process.  Compression is used when enabled
 * and not switched off (compression_switch), otherwise the plain/xbzrle
 * path in ram_save_page().
 *
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: block/offset of the page we want to send
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 *
 * Returns: Number of pages written (0 when the page was clean, or when
 *          xbzrle found it identical and wrote nothing); negative on error.
 */
static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
                                PageSearchStatus *pss,
                                bool last_stage,
                                uint64_t *bytes_transferred,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check the pages is dirty and if it is send it */
    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(f, pss,
                                           last_stage,
                                           bytes_transferred);
        } else {
            res = ram_save_page(f, pss, last_stage,
                                bytes_transferred);
        }

        if (res < 0) {
            return res;
        }
        /* Mark the page as sent for postcopy's unsent tracking (if any) */
        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            last_sent_block = pss->block;
        }
    }

    return res;
}
1268
/**
 * ram_save_host_page: Starting at *offset send pages up to the end
 *                     of the current host page.  It's valid for the initial
 *                     offset to point into the middle of a host page
 *                     in which case the remainder of the hostpage is sent.
 *                     Only dirty target pages are sent.
 *
 * Returns: Number of pages written (sum over all target pages in the host
 *          page), or negative on error.
 *
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: block/offset of the page we want to send; offset is updated to the
 *       last target page looked at
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
                              PageSearchStatus *pss,
                              bool last_stage,
                              uint64_t *bytes_transferred,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    /* Walk target pages until the next host-page boundary */
    do {
        tmppages = ram_save_target_page(ms, f, pss, last_stage,
                                        bytes_transferred, dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (pss->offset & (qemu_host_page_size - 1));

    /* The offset we leave with is the last one we looked at */
    pss->offset -= TARGET_PAGE_SIZE;
    return pages;
}
1309
/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * First services any queued postcopy page requests; when the queue is
 * empty, falls back to scanning for the next dirty page from where the
 * previous call left off (last_seen_block/last_offset).
 *
 * Returns: The number of pages written
 *          0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    PageSearchStatus pss;
    MigrationState *ms = migrate_get_current();
    int pages = 0;
    bool again, found;
    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                 ram_addr_t space */

    /* Resume the search from where the previous call stopped */
    pss.block = last_seen_block;
    pss.offset = last_offset;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        /* Queued (postcopy) requests take priority over the linear scan */
        found = get_queued_page(ms, &pss, &dirty_ram_abs);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
        }

        if (found) {
            pages = ram_save_host_page(ms, f, &pss,
                                       last_stage, bytes_transferred,
                                       dirty_ram_abs);
        }
    } while (!pages && again);

    /* Remember where we were for the next call */
    last_seen_block = pss.block;
    last_offset = pss.offset;

    return pages;
}
1365
1366 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1367 {
1368 uint64_t pages = size / TARGET_PAGE_SIZE;
1369 if (zero) {
1370 acct_info.dup_pages += pages;
1371 } else {
1372 acct_info.norm_pages += pages;
1373 bytes_transferred += size;
1374 qemu_update_position(f, size);
1375 }
1376 }
1377
/* Number of target pages still marked dirty, i.e. still to be sent */
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}
1382
/* Bytes of RAM still to send, derived from the dirty-page count */
uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}
1387
/* Total bytes of RAM data accounted as transferred so far */
uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}
1392
1393 uint64_t ram_bytes_total(void)
1394 {
1395 RAMBlock *block;
1396 uint64_t total = 0;
1397
1398 rcu_read_lock();
1399 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1400 total += block->used_length;
1401 rcu_read_unlock();
1402 return total;
1403 }
1404
/*
 * Free the lazily-allocated buffer used by load_xbzrle() to hold decoded
 * page data.  Safe to call if the buffer was never allocated.
 */
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL; /* allow lazy re-allocation on next use */
}
1410
/* RCU reclaim callback: free both bitmaps and the containing struct */
static void migration_bitmap_free(struct BitmapRcu *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap); /* may be NULL (no postcopy); g_free(NULL) ok */
    g_free(bmap);
}
1417
/*
 * SaveVMHandlers cleanup hook: tear down the migration dirty bitmap
 * (deferring the free via RCU) and release all XBZRLE resources.
 *
 * @opaque: unused
 */
static void ram_migration_cleanup(void *opaque)
{
    /* caller have hold iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
    struct BitmapRcu *bitmap = migration_bitmap_rcu;
    atomic_rcu_set(&migration_bitmap_rcu, NULL);
    if (bitmap) {
        /* Bitmap existed, so dirty logging was started; stop it too */
        memory_global_dirty_log_stop();
        call_rcu(bitmap, migration_bitmap_free, rcu);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(ZERO_TARGET_PAGE);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}
1442
/*
 * Reset the page-search state so the next scan starts from the head of
 * the RAM block list, and re-enter the bulk (first full pass) stage.
 * Also records the current ram_list version, so a changed block list can
 * be detected later.
 */
static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}
1451
1452 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1453
/*
 * Grow the migration dirty bitmap from 'old' to 'new' pages when RAM is
 * added during migration: the old bits are copied, all newly added pages
 * are marked dirty, and the replacement is published via RCU.
 */
void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    /* called in qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (migration_bitmap_rcu) {
        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* prevent migration_bitmap content from being set bit
         * by migration_bitmap_sync_range() at the same time.
         * it is safe to migration if migration_bitmap is cleared bit
         * at the same time.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        /* New pages have never been sent: mark them all dirty */
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        /* Free the old bitmap once all readers are done with it */
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}
1485
/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 *
 * Output is one stderr line per 128-page chunk that contains at least one
 * unexpected bit: the chunk's start page in hex, then '1'/'.' per page.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129]; /* one line of bits + NUL */

    if (!todump) {
        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            /* Only print lines that differ somewhere from 'expected' */
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1524
1525 /* **** functions for postcopy ***** */
1526
1527 /*
1528 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1529 * Note: At this point the 'unsentmap' is the processed bitmap combined
1530 * with the dirtymap; so a '1' means it's either dirty or unsent.
1531 * start,length: Indexes into the bitmap for the first bit
1532 * representing the named block and length in target-pages
1533 */
1534 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1535 PostcopyDiscardState *pds,
1536 unsigned long start,
1537 unsigned long length)
1538 {
1539 unsigned long end = start + length; /* one after the end */
1540 unsigned long current;
1541 unsigned long *unsentmap;
1542
1543 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1544 for (current = start; current < end; ) {
1545 unsigned long one = find_next_bit(unsentmap, end, current);
1546
1547 if (one <= end) {
1548 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1549 unsigned long discard_length;
1550
1551 if (zero >= end) {
1552 discard_length = end - one;
1553 } else {
1554 discard_length = zero - one;
1555 }
1556 if (discard_length) {
1557 postcopy_discard_send_range(ms, pds, one, discard_length);
1558 }
1559 current = one + discard_length;
1560 } else {
1561 current = one;
1562 }
1563 }
1564
1565 return 0;
1566 }
1567
/*
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * Returns: 0 on success
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
                                                               first,
                                                               block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, first,
                                    block->used_length >> TARGET_PAGE_BITS);
        /* Always finish the discard stream for this block, even on error */
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
1602
/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 * the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                   the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                    the main migration bitmap
 *
 * Any host page found to be partially sent/dirty is "fixed up": the
 * destination is told to discard the whole host page, and every target
 * page within it is re-marked unsent and dirty so it gets resent whole.
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    /* Number of target pages per host page */
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Back up to the host-page boundary the run starts inside */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
1725
/*
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.
 *
 * Runs the two postcopy_chunk_hostpages_pass() passes (unsent, then
 * dirty) over every RAM block.  A no-op when host page == target page.
 *
 * Returns: 0 on success
 */
static int postcopy_chunk_hostpages(MigrationState *ms)
{
    struct RAMBlock *block;

    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS - nothing to be done */
        return 0;
    }

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;

        PostcopyDiscardState *pds =
                         postcopy_discard_send_init(ms, first, block->idstr);

        /* First pass: Discard all partially sent host pages */
        postcopy_chunk_hostpages_pass(ms, true, block, pds);
        /*
         * Second pass: Ensure that all partially dirty host pages are made
         * fully dirty.
         */
        postcopy_chunk_hostpages_pass(ms, false, block, pds);

        postcopy_discard_send_finish(ms, pds);
    } /* ram_list loop */

    return 0;
}
1767
/*
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * Returns: 0 on success, negative errno on failure.
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    int ret;
    unsigned long *bitmap, *unsentmap;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync();

    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
    if (!unsentmap) {
        /* We don't have a safe way to resize the sentmap, so
         * if the bitmap was resized it will be NULL at this
         * point.
         */
        error_report("migration ram resized during precopy phase");
        rcu_read_unlock();
        return -EINVAL;
    }

    /* Deal with TPS != HPS */
    ret = postcopy_chunk_hostpages(ms);
    if (ret) {
        rcu_read_unlock();
        return ret;
    }

    /*
     * Update the unsentmap to be unsentmap = unsentmap | dirty
     * (covers both cases a) and b) from the header comment)
     */
    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    bitmap_or(unsentmap, unsentmap, bitmap,
              last_ram_offset() >> TARGET_PAGE_BITS);


    trace_ram_postcopy_send_discard_bitmap();
#ifdef DEBUG_POSTCOPY
    ram_debug_dump_bitmap(unsentmap, true);
#endif

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}
1823
/*
 * At the start of the postcopy phase of migration, any now-dirty
 * precopied pages are discarded.
 *
 * start, length describe a byte address range within the RAMBlock
 * named 'block_name'; both ends must be host-page aligned and the range
 * must fit within the block's used_length.
 *
 * Returns 0 on success, -1 on error (block not found, misaligned range,
 * overrun, or discard failure).
 */
int ram_discard_range(MigrationIncomingState *mis,
                      const char *block_name,
                      uint64_t start, size_t length)
{
    int ret = -1;

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(block_name);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'",
                     block_name);
        goto err;
    }

    uint8_t *host_startaddr = rb->host + start;

    /* Discards operate on whole host pages; reject unaligned requests */
    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
        error_report("ram_discard_range: Unaligned start address: %p",
                     host_startaddr);
        goto err;
    }

    if ((start + length) <= rb->used_length) {
        uint8_t *host_endaddr = host_startaddr + length;
        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
            error_report("ram_discard_range: Unaligned end address: %p",
                         host_endaddr);
            goto err;
        }
        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
    } else {
        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
                     "/%zx/" RAM_ADDR_FMT")",
                     block_name, start, length, rb->used_length);
    }

err:
    rcu_read_unlock();

    return ret;
}
1874
/*
 * One-time setup for a RAM migration pass: initialise the XBZRLE cache
 * and buffers (if enabled), reset counters and search state, allocate the
 * dirty (and, for postcopy, unsent) bitmaps, and start dirty logging with
 * an initial sync.
 *
 * Returns: 0 on success, -1 if an XBZRLE allocation failed.
 */
static int ram_save_init_globals(void)
{
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();
    qemu_mutex_init(&migration_bitmap_mutex);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
    /* Start with every page dirty so the first pass sends everything */
    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);

    if (migrate_postcopy_ram()) {
        /* Postcopy also tracks which pages have never been sent */
        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}
1947
1948 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1949 * long-running RCU critical section. When rcu-reclaims in the code
1950 * start to become numerous it will be necessary to reduce the
1951 * granularity of these critical sections.
1952 */
1953
/*
 * SaveVMHandlers setup stage: initialise globals (unless COLO already
 * did), then write the stream header: total RAM size plus the id and
 * used_length of every RAM block, terminated by an EOS flag.
 *
 * @opaque: unused
 * Returns: 0 on success, -1 if global initialisation failed.
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_save_init_globals() < 0) {
            return -1;
        }
    }

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
1984
/*
 * SaveVMHandlers iterate stage: send dirty pages until the bandwidth
 * limit is hit, there is nothing left to send, or we have spent more
 * than MAX_WAIT ms in this call.
 *
 * @opaque: unused
 * Returns: 1 when no dirty pages remained (done), 0 when interrupted by
 *          rate limiting / timeout, or a negative stream error.
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    /* The block list changed since last time: restart the scan */
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        acct_info.iterations++;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8; /* account for the EOS marker itself */

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
2049
/*
 * SaveVMHandlers completion stage: flush every remaining dirty page,
 * ignoring rate limits.  Called with iothread lock held.
 *
 * @opaque: unused
 * Returns: 0.
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    /* In postcopy the final sync already happened at the transition */
    if (!migration_in_postcopy(migrate_get_current())) {
        migration_bitmap_sync();
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        /* In COLO state this is not the last stage */
        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
                                        &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
2084
/*
 * SaveVMHandlers pending estimate: report how much RAM data is left.
 * If the estimate dropped below max_size, resync the dirty bitmap first
 * for a more accurate figure.  All RAM is reported as postcopiable;
 * *non_postcopiable_pending is left untouched.
 *
 * @opaque: unused
 */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy(migrate_get_current()) &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}
2106
/*
 * Load one XBZRLE-encoded page from the stream and apply it to the page
 * at 'host'.  The (flags, length, data) header format must match what the
 * source wrote; the decode buffer is allocated lazily and kept for reuse
 * (freed by free_xbzrle_decoded_buf()).
 *
 * @f: stream to read from
 * @addr: unused
 * @host: destination page (existing contents are the xbzrle base)
 * Returns: 0 on success, -1 on a malformed or undecodable page.
 */
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
2143
/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
/*
 * Read a RAMBlock ID from the stream f.
 *
 * The most recently resolved block is cached in a function-local static,
 * so a page flagged RAM_SAVE_FLAG_CONTINUE can reuse it without re-reading
 * an id string.  (This makes the function single-stream only.)
 *
 * f: Stream to read from
 * flags: Page flags (mostly to see if it's a continuation of previous block)
 *
 * Returns: the block, or NULL on a malformed stream / unknown block id.
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
                                              int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            /* CONTINUE before any block id was ever read */
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    /* Length-prefixed block id string */
    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}
2180
2181 static inline void *host_from_ram_block_offset(RAMBlock *block,
2182 ram_addr_t offset)
2183 {
2184 if (!offset_in_ramblock(block, offset)) {
2185 return NULL;
2186 }
2187
2188 return block->host + offset;
2189 }
2190
2191 /*
2192 * If a page (or a whole RDMA chunk) has been
2193 * determined to be zero, then zap it.
2194 */
2195 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2196 {
2197 if (ch != 0 || !is_zero_range(host, size)) {
2198 memset(host, ch, size);
2199 }
2200 }
2201
/*
 * Worker body for one decompression thread.
 *
 * Waits on param->cond for a job (param->des is set, under param->mutex,
 * by decompress_data_with_multi_threads()), inflates the compressed page
 * directly into guest RAM, then marks itself idle via param->done under
 * decomp_done_lock.  Exits when param->quit is set by
 * migrate_decompress_threads_join().
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take a local copy of the job and clear the request slot
             * before dropping the mutex, so the feeder sees a consistent
             * "busy" state. */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() will return failed in some case, especially
             * when the page is dirtied when doing the compression, it's
             * not a problem because the dirty page will be retransferred
             * and uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            /* 'done' is protected by decomp_done_lock, which the feeder
             * holds while scanning for an idle thread. */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* No work queued: sleep until signalled */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
2240
2241 static void wait_for_decompress_done(void)
2242 {
2243 int idx, thread_count;
2244
2245 if (!migrate_use_compression()) {
2246 return;
2247 }
2248
2249 thread_count = migrate_decompress_threads();
2250 qemu_mutex_lock(&decomp_done_lock);
2251 for (idx = 0; idx < thread_count; idx++) {
2252 while (!decomp_param[idx].done) {
2253 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2254 }
2255 }
2256 qemu_mutex_unlock(&decomp_done_lock);
2257 }
2258
2259 void migrate_decompress_threads_create(void)
2260 {
2261 int i, thread_count;
2262
2263 thread_count = migrate_decompress_threads();
2264 decompress_threads = g_new0(QemuThread, thread_count);
2265 decomp_param = g_new0(DecompressParam, thread_count);
2266 qemu_mutex_init(&decomp_done_lock);
2267 qemu_cond_init(&decomp_done_cond);
2268 for (i = 0; i < thread_count; i++) {
2269 qemu_mutex_init(&decomp_param[i].mutex);
2270 qemu_cond_init(&decomp_param[i].cond);
2271 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2272 decomp_param[i].done = true;
2273 decomp_param[i].quit = false;
2274 qemu_thread_create(decompress_threads + i, "decompress",
2275 do_data_decompress, decomp_param + i,
2276 QEMU_THREAD_JOINABLE);
2277 }
2278 }
2279
2280 void migrate_decompress_threads_join(void)
2281 {
2282 int i, thread_count;
2283
2284 thread_count = migrate_decompress_threads();
2285 for (i = 0; i < thread_count; i++) {
2286 qemu_mutex_lock(&decomp_param[i].mutex);
2287 decomp_param[i].quit = true;
2288 qemu_cond_signal(&decomp_param[i].cond);
2289 qemu_mutex_unlock(&decomp_param[i].mutex);
2290 }
2291 for (i = 0; i < thread_count; i++) {
2292 qemu_thread_join(decompress_threads + i);
2293 qemu_mutex_destroy(&decomp_param[i].mutex);
2294 qemu_cond_destroy(&decomp_param[i].cond);
2295 g_free(decomp_param[i].compbuf);
2296 }
2297 g_free(decompress_threads);
2298 g_free(decomp_param);
2299 decompress_threads = NULL;
2300 decomp_param = NULL;
2301 }
2302
2303 static void decompress_data_with_multi_threads(QEMUFile *f,
2304 void *host, int len)
2305 {
2306 int idx, thread_count;
2307
2308 thread_count = migrate_decompress_threads();
2309 qemu_mutex_lock(&decomp_done_lock);
2310 while (true) {
2311 for (idx = 0; idx < thread_count; idx++) {
2312 if (decomp_param[idx].done) {
2313 decomp_param[idx].done = false;
2314 qemu_mutex_lock(&decomp_param[idx].mutex);
2315 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2316 decomp_param[idx].des = host;
2317 decomp_param[idx].len = len;
2318 qemu_cond_signal(&decomp_param[idx].cond);
2319 qemu_mutex_unlock(&decomp_param[idx].mutex);
2320 break;
2321 }
2322 }
2323 if (idx < thread_count) {
2324 break;
2325 } else {
2326 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2327 }
2328 }
2329 qemu_mutex_unlock(&decomp_done_lock);
2330 }
2331
2332 /*
2333 * Allocate data structures etc needed by incoming migration with postcopy-ram
2334 * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
2335 */
2336 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2337 {
2338 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2339
2340 return postcopy_ram_incoming_init(mis, ram_pages);
2341 }
2342
/*
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Assembles incoming target pages into whole host pages in a temporary
 * buffer and then 'places' each completed host page atomically, since
 * the guest may already be running and faulting on this memory.
 *
 * Returns 0 on success, negative on stream or placement error.
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    /* With matching sizes a page can be read straight into place_source */
    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        uint8_t ch;

        /* Record header: page address with flags packed in the low bits */
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            /*
             * Postcopy requires that we place whole host pages atomically.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & ~qemu_host_page_mask);
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
                /* First target page within this host page */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                             ~qemu_host_page_mask) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_COMPRESS:
            /* Whole target page is a single repeated byte */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            if (all_zero) {
                ret = postcopy_place_page_zero(mis,
                                               host + TARGET_PAGE_SIZE -
                                               qemu_host_page_size);
            } else {
                /* Place the assembled (or in-place read) host page */
                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
                                               qemu_host_page_size,
                                               place_source);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
2464
/*
 * Incoming side: parse the RAM migration stream and populate guest
 * memory.  Dispatches to ram_load_postcopy() once postcopy is at least
 * in the LISTENING state, since postcopy page inserts must be atomic.
 *
 * f: stream to read from
 * opaque: unused
 * version_id: section version from the stream; only 4 is accepted
 *
 * Returns 0 on success, negative on error.
 */
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;   /* invocation counter, trace output only */
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    /* NOTE(review): if version_id != 4 and postcopy is already running,
     * the -EINVAL set above is overwritten by ram_load_postcopy()'s
     * return value here — confirm this is intended. */
    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        /* Record header: page address with flags packed in the low bits */
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        /* Flags that carry page data need the host address resolved first */
        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                /* Each entry: length-prefixed ID, then a be64 length */
                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        /* Resize our block to match the source's */
                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_COMPRESS:
            /* Whole target page is a single repeated byte */
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            /* Raw page data */
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            /* zlib-compressed page, handed to the decompression pool */
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                /* Transport-specific hook (e.g. RDMA registration) */
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            /* Propagate any low-level stream error */
            ret = qemu_file_get_error(f);
        }
    }

    /* Drain outstanding work in the decompression threads before return */
    wait_for_decompress_done();
    rcu_read_unlock();
    DPRINTF("Completed load of VM with exit code %d seq iteration "
            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
}
2601
/* Save/load callbacks registered for the "ram" section in ram_mig_init() */
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    /* ram_save_complete handles both precopy and postcopy completion */
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};
2611
/* One-time registration of the "ram" live-migration section. */
void ram_mig_init(void)
{
    /* Init the XBZRLE cache lock before any handler can run */
    qemu_mutex_init(&XBZRLE.lock);
    /* "ram" section, instance 0, version 4 (checked in ram_load()) */
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}