/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <stdint.h>
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"

#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

static int dirty_rate_high_cnt;

static uint64_t bitmap_sync_count;

/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_find_nonzero_offset(p, size) == size;
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/*
 * called from qmp_migrate_set_cache_size in main thread, possibly while
 * a migration is in progress.
 * A running migration may be using the cache and might finish during this
 * call, hence changes to the cache are protected by XBZRLE.lock().
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;

static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}

uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}

/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static QemuMutex migration_bitmap_mutex;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current offset to search from */
    ram_addr_t offset;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;

struct CompressParam {
    bool start;
    bool done;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool start;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex *comp_done_lock;
static QemuCond *comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static bool quit_comp_thread;
static bool quit_decomp_thread;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static uint8_t *compressed_data_buf;

static int do_compress_ram_page(CompressParam *param);

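/* Worker thread body for one compression thread: waits on its CompressParam
 * for work from the migration thread, compresses the requested page into the
 * per-thread QEMUFile buffer, then marks itself done and signals
 * comp_done_cond so the migration thread can collect the result.
 */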
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;

    while (!quit_comp_thread) {
        qemu_mutex_lock(&param->mutex);
        /* Re-check quit_comp_thread in case terminate_compression_threads()
         * was called just before qemu_mutex_lock(&param->mutex) and after
         * while (!quit_comp_thread); re-checking it here makes sure the
         * compression thread terminates as expected.
         */
        while (!param->start && !quit_comp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        if (!quit_comp_thread) {
            do_compress_ram_page(param);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);

        qemu_mutex_lock(comp_done_lock);
        param->done = true;
        qemu_cond_signal(comp_done_cond);
        qemu_mutex_unlock(comp_done_lock);
    }

    return NULL;
}

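/* Ask all compression threads to exit: set the shared quit flag and kick
 * each thread's condition variable so none of them stays blocked waiting
 * for work.
 */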
static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    quit_comp_thread = true;
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

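/* Tear down the compression thread pool created by
 * migrate_compress_threads_create(): join every thread and free the
 * per-thread files, mutexes and condition variables.
 */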
void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(comp_done_lock);
    qemu_cond_destroy(comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    g_free(comp_done_cond);
    g_free(comp_done_lock);
    compress_threads = NULL;
    comp_param = NULL;
    comp_done_cond = NULL;
    comp_done_lock = NULL;
}

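/* Spawn the pool of compression worker threads (one per
 * migrate_compress_threads()), each with its own dummy-buffer QEMUFile,
 * mutex and condition variable; a no-op when compression is not in use.
 */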
void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    quit_comp_thread = false;
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    comp_done_cond = g_new0(QemuCond, 1);
    comp_done_lock = g_new0(QemuMutex, 1);
    qemu_cond_init(comp_done_cond);
    qemu_mutex_init(comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 * If guest dirty memory rate is reduced below the rate at which we can
 * transfer pages to the destination then we should be able to complete
 * migration. Some workloads dirty memory way too fast and will not effectively
 * converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial =
            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
    uint64_t pct_icrement =
            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}

/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    if (ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 bitmap_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @current_data:
 * @current_addr:
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}

/* Called with rcu_read_lock() to protect migration_bitmap
 * rb: The RAMBlock to search for dirty pages in
 * start: Start address (typically so we can continue from previous page)
 * ram_addr_abs: Pointer into which to store the address of the dirty page
 *               within the global ram_addr space
 *
 * Returns: byte offset within memory region of the start of a dirty page
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}

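/* Clear the dirty bit for the page containing addr in the migration bitmap
 * and keep migration_dirty_pages in sync.  Returns true if the bit was
 * previously set (i.e. the page still needed sending).
 */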
static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
{
    bool ret;
    int nr = addr >> TARGET_PAGE_BITS;
    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;

    ret = test_and_clear_bit(nr, bitmap);

    if (ret) {
        migration_dirty_pages--;
    }
    return ret;
}

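/* Pull the dirty bits for [start, start + length) from the global dirty
 * memory log into the migration bitmap, adding any newly dirtied pages to
 * migration_dirty_pages.
 */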
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}

/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;

static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}

/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    address_space_sync_dirty_bitmap(&address_space_memory);

    qemu_mutex_lock(&migration_bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&migration_bitmap_mutex);

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */
            bytes_xfer_now = ram_bytes_transferred();

            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                    (bytes_xfer_now - bytes_xfer_prev) / 2) &&
                (dirty_rate_high_cnt++ >= 2)) {
                trace_migration_throttle();
                dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
            bytes_xfer_prev = bytes_xfer_now;
        }

        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                acct_info.xbzrle_cache_miss_rate =
                   (double)(acct_info.xbzrle_cache_miss -
                            xbzrle_cache_miss_prev) /
                   (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
    }
}

/**
 * save_zero_page: Send the zero page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p, uint64_t *bytes_transferred)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        acct_info.dup_pages++;
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        *bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}

/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;

    p = block->host + offset;

    /* If in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

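/* Compress one target page described by param (block/offset) into the
 * per-thread QEMUFile buffer: write the page header, then the compressed
 * page data.  Returns the number of bytes produced.
 */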
static int do_compress_ram_page(CompressParam *param)
{
    int bytes_sent, blen;
    uint8_t *p;
    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(param->file, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    bytes_sent += blen;

    return bytes_sent;
}

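/* Kick one compression thread: mark it as busy, then set its start flag and
 * signal its condition variable under the per-thread mutex.
 */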
static inline void start_compression(CompressParam *param)
{
    param->done = false;
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}

static inline void start_decompression(DecompressParam *param)
{
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}

static uint64_t bytes_transferred;

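/* Wait for every compression thread to finish its current page and flush
 * the contents of each per-thread buffer into the migration stream,
 * updating bytes_transferred accordingly.
 */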
static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        if (!comp_param[idx].done) {
            qemu_mutex_lock(comp_done_lock);
            while (!comp_param[idx].done && !quit_comp_thread) {
                qemu_cond_wait(comp_done_cond, comp_done_lock);
            }
            qemu_mutex_unlock(comp_done_lock);
        }
        if (!quit_comp_thread) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

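/* Queue one page for compression on the first idle thread, flushing that
 * thread's previous output into the stream first; blocks on comp_done_cond
 * until a thread becomes free.  Returns the number of pages queued (1).
 */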
static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                set_compress_params(&comp_param[idx], block, offset);
                start_compression(&comp_param[idx]);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(comp_done_cond, comp_done_lock);
        }
    }
    qemu_mutex_unlock(comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
                                    ram_addr_t offset, bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    uint8_t *p;
    int ret;

    p = block->host + offset;

    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                set_compress_params(&comp_param[0], block, offset);
                /* Use the qemu thread to compress the data to make sure the
                 * first page is sent out before other pages
                 */
                bytes_xmit = do_compress_ram_page(&comp_param[0]);
                acct_info.norm_pages++;
                qemu_put_qemu_file(f, comp_param[0].file);
                *bytes_transferred += bytes_xmit;
                pages = 1;
            }
        } else {
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}

/*
 * Find the next dirty page and update any state associated with
 * the search process.
 *
 * Returns: True if a page is found
 *
 * @f: Current migration stream.
 * @pss: Data about the state of the current dirty page scan.
 * @*again: Set to false if the search has scanned the whole of RAM
 * *ram_addr_abs: Pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
                             bool *again, ram_addr_t *ram_addr_abs)
{
    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
                                              ram_addr_abs);
    if (pss->complete_round && pss->block == last_seen_block &&
        pss->offset >= last_offset) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (pss->offset >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->offset = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(f);
                compression_switch = false;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/*
 * Helper for 'get_queued_page' - gets a page off the queue
 *      ms:      MigrationState in
 * *offset:      Used to return the offset within the RAMBlock
 * ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns:      block (or NULL if none available)
 */
static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
                              ram_addr_t *ram_addr_abs)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&ms->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
        struct MigrationSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&ms->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
        *ram_addr_abs = (entry->offset + entry->rb->offset) &
                        TARGET_PAGE_MASK;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&ms->src_page_req_mutex);

    return block;
}

/*
 * Unqueue a page from the queue fed by postcopy page requests; skips pages
 * that are already sent (!dirty)
 *
 *      ms:      MigrationState in
 *     pss:      PageSearchStatus structure updated with found block/offset
 * ram_addr_abs: global offset in the dirty/sent bitmaps
 *
 * Returns:      true if a queued page is found
 */
static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
                            ram_addr_t *ram_addr_abs)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(ms, &offset, ram_addr_abs);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long *bitmap;
            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(
                    block->idstr, (uint64_t)offset,
                    (uint64_t)*ram_addr_abs,
                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
            } else {
                trace_get_queued_page(block->idstr,
                                      (uint64_t)offset,
                                      (uint64_t)*ram_addr_abs);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->offset = offset;
    }

    return !!block;
}

/**
 * flush_page_queue: Flush any remaining pages in the ram request queue
 *    it should be empty at the end anyway, but in error cases there may be
 *    some left.
 *
 * ms: MigrationState
 */
void flush_page_queue(MigrationState *ms)
{
    struct MigrationSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * Queue the pages for transmission, e.g. a request from postcopy destination
 *   ms: MigrationState in which the queue is held
 *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
 *   start: Offset from the start of the RAMBlock
 *   len: Length (in bytes) to send
 *   Return: 0 on success
 */
int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                         ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;

    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = ms->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        ms->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct MigrationSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct MigrationSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&ms->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&ms->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

/**
 * ram_save_target_page: Save one target page
 *
 *
 * @f: QEMUFile where to send the data
 * @block: pointer to block that contains the page we want to send
 * @offset: offset inside the block for the page;
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 *
 * Returns: Number of pages written.
 */
static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
                                RAMBlock *block, ram_addr_t offset,
                                bool last_stage,
                                uint64_t *bytes_transferred,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check if the page is dirty and, if so, send it */
    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(f, block, offset,
                                           last_stage,
                                           bytes_transferred);
        } else {
            res = ram_save_page(f, block, offset, last_stage,
                                bytes_transferred);
        }

        if (res < 0) {
            return res;
        }
        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            last_sent_block = block;
        }
    }

    return res;
}

/**
 * ram_save_host_page: Starting at *offset send pages up to the end
 *                     of the current host page. It's valid for the initial
 *                     offset to point into the middle of a host page
 *                     in which case the remainder of the hostpage is sent.
 *                     Only dirty target pages are sent.
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: pointer to block that contains the page we want to send
 * @offset: offset inside the block for the page; updated to last target page
 *          sent
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
                              ram_addr_t *offset, bool last_stage,
                              uint64_t *bytes_transferred,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    do {
        tmppages = ram_save_target_page(ms, f, block, *offset, last_stage,
                                        bytes_transferred, dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        *offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (*offset & (qemu_host_page_size - 1));

    /* The offset we leave with is the last one we looked at */
    *offset -= TARGET_PAGE_SIZE;
    return pages;
}

/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns:  The number of pages written
 *           0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    PageSearchStatus pss;
    MigrationState *ms = migrate_get_current();
    int pages = 0;
    bool again, found;
    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                 ram_addr_t space */

    pss.block = last_seen_block;
    pss.offset = last_offset;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(ms, &pss, &dirty_ram_abs);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
        }

        if (found) {
            pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
                                       last_stage, bytes_transferred,
                                       dirty_ram_abs);
        }
    } while (!pages && again);

    last_seen_block = pss.block;
    last_offset = pss.offset;

    return pages;
}

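/* Update the accounting for @size bytes that were written (or skipped
 * because they were zero pages) outside of ram_save_page(), keeping the
 * dup/norm page counts, bytes_transferred and the QEMUFile position
 * consistent.
 */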
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    if (zero) {
        acct_info.dup_pages += pages;
    } else {
        acct_info.norm_pages += pages;
        bytes_transferred += size;
        qemu_update_position(f, size);
    }
}

static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}

uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}

void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}

static void migration_bitmap_free(struct BitmapRcu *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap);
    g_free(bmap);
}

static void ram_migration_cleanup(void *opaque)
{
    /* The caller must hold the iothread lock or be in a bh, so there is
     * no writing race against this migration_bitmap
     */
    struct BitmapRcu *bitmap = migration_bitmap_rcu;
    atomic_rcu_set(&migration_bitmap_rcu, NULL);
    if (bitmap) {
        memory_global_dirty_log_stop();
        call_rcu(bitmap, migration_bitmap_free, rcu);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}

static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    /* called in qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (migration_bitmap_rcu) {
        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* prevent migration_bitmap content from having bits set
         * by migration_bitmap_sync_range() at the same time.
         * it is safe for migration if a migration_bitmap bit is
         * cleared at the same time.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

/*
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 * start,length: Indexes into the bitmap for the first bit
 *               representing the named block and length in target-pages
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        unsigned long start,
                                        unsigned long length)
{
    unsigned long end = start + length; /* one after the end */
    unsigned long current;
    unsigned long *unsentmap;

    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
    for (current = start; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            postcopy_discard_send_range(ms, pds, one, discard_length);
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/*
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * Returns: 0 on success
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
                                                               first,
                                                               block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, first,
                                    block->used_length >> TARGET_PAGE_BITS);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 * the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                   the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                    the main migration bitmap
 *
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}

/*
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.
 *
 * Returns: 0 on success
 */
static int postcopy_chunk_hostpages(MigrationState *ms)
{
    struct RAMBlock *block;

    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS - nothing to be done */
        return 0;
    }

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;

        PostcopyDiscardState *pds =
                         postcopy_discard_send_init(ms, first, block->idstr);

        /* First pass: Discard all partially sent host pages */
        postcopy_chunk_hostpages_pass(ms, true, block, pds);
        /*
         * Second pass: Ensure that all partially dirty host pages are made
         * fully dirty.
         */
        postcopy_chunk_hostpages_pass(ms, false, block, pds);

        postcopy_discard_send_finish(ms, pds);
    } /* ram_list loop */

    return 0;
}

e0b266f0
DDAG
1766/*
1767 * Transmit the set of pages to be discarded after precopy to the target
1768 * these are pages that:
1769 * a) Have been previously transmitted but are now dirty again
1770 * b) Pages that have never been transmitted, this ensures that
1771 * any pages on the destination that have been mapped by background
1772 * tasks get discarded (transparent huge pages is the specific concern)
1773 * Hopefully this is pretty sparse
1774 */
1775int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1776{
1777 int ret;
1778 unsigned long *bitmap, *unsentmap;
1779
1780 rcu_read_lock();
1781
1782 /* This should be our last sync, the src is now paused */
1783 migration_bitmap_sync();
1784
1785 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1786 if (!unsentmap) {
1787 /* We don't have a safe way to resize the sentmap, so
1788 * if the bitmap was resized it will be NULL at this
1789 * point.
1790 */
1791 error_report("migration ram resized during precopy phase");
1792 rcu_read_unlock();
1793 return -EINVAL;
1794 }
1795
99e314eb
DDAG
1796 /* Deal with TPS != HPS */
1797 ret = postcopy_chunk_hostpages(ms);
1798 if (ret) {
1799 rcu_read_unlock();
1800 return ret;
1801 }
1802
e0b266f0
DDAG
1803 /*
1804 * Update the unsentmap to be unsentmap = unsentmap | dirty
1805 */
1806 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1807 bitmap_or(unsentmap, unsentmap, bitmap,
1808 last_ram_offset() >> TARGET_PAGE_BITS);
1809
1810
1811 trace_ram_postcopy_send_discard_bitmap();
1812#ifdef DEBUG_POSTCOPY
1813 ram_debug_dump_bitmap(unsentmap, true);
1814#endif
1815
1816 ret = postcopy_each_ram_send_discard(ms);
1817 rcu_read_unlock();
1818
1819 return ret;
1820}
1821
1822/*
1823 * At the start of the postcopy phase of migration, any now-dirty
1824 * precopied pages are discarded.
1825 *
1826 * start, length describe a byte address range within the RAMBlock
1827 *
1828 * Returns 0 on success.
1829 */
1830int ram_discard_range(MigrationIncomingState *mis,
1831 const char *block_name,
1832 uint64_t start, size_t length)
1833{
1834 int ret = -1;
1835
1836 rcu_read_lock();
1837 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1838
1839 if (!rb) {
1840 error_report("ram_discard_range: Failed to find block '%s'",
1841 block_name);
1842 goto err;
1843 }
1844
1845 uint8_t *host_startaddr = rb->host + start;
1846
1847 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1848 error_report("ram_discard_range: Unaligned start address: %p",
1849 host_startaddr);
1850 goto err;
1851 }
1852
1853 if ((start + length) <= rb->used_length) {
1854 uint8_t *host_endaddr = host_startaddr + length;
1855 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1856 error_report("ram_discard_range: Unaligned end address: %p",
1857 host_endaddr);
1858 goto err;
1859 }
1860 ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1861 } else {
1862 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
9458ad6b 1863 "/%zx/" RAM_ADDR_FMT")",
e0b266f0
DDAG
1864 block_name, start, length, rb->used_length);
1865 }
1866
1867err:
1868 rcu_read_unlock();
1869
1870 return ret;
1871}
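/*
 * Editor's illustrative sketch (not part of migration/ram.c): the alignment
 * checks in ram_discard_range() rely on the host page size being a power of
 * two, so "addr & (page_size - 1)" is the offset of addr inside its page and
 * must be zero at both ends of a page-aligned discard range.  Hypothetical
 * standalone check (names invented):
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

static bool sketch_range_is_page_aligned(uintptr_t start, size_t length,
                                         size_t page_size /* power of two */)
{
    return ((start & (page_size - 1)) == 0) &&
           (((start + length) & (page_size - 1)) == 0);
}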
1872
1873
56e93d26
JQ
1874/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1875 * a long-running RCU critical section. When RCU reclaims in the code
1876 * start to become numerous, it will be necessary to reduce the
1877 * granularity of these critical sections.
1878 */
1879
1880static int ram_save_setup(QEMUFile *f, void *opaque)
1881{
1882 RAMBlock *block;
1883 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1884
56e93d26
JQ
1885 dirty_rate_high_cnt = 0;
1886 bitmap_sync_count = 0;
1887 migration_bitmap_sync_init();
dd631697 1888 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
1889
1890 if (migrate_use_xbzrle()) {
1891 XBZRLE_cache_lock();
1892 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1893 TARGET_PAGE_SIZE,
1894 TARGET_PAGE_SIZE);
1895 if (!XBZRLE.cache) {
1896 XBZRLE_cache_unlock();
1897 error_report("Error creating cache");
1898 return -1;
1899 }
1900 XBZRLE_cache_unlock();
1901
1902 /* We prefer not to abort if there is no memory */
1903 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1904 if (!XBZRLE.encoded_buf) {
1905 error_report("Error allocating encoded_buf");
1906 return -1;
1907 }
1908
1909 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1910 if (!XBZRLE.current_buf) {
1911 error_report("Error allocating current_buf");
1912 g_free(XBZRLE.encoded_buf);
1913 XBZRLE.encoded_buf = NULL;
1914 return -1;
1915 }
1916
1917 acct_clear();
1918 }
1919
1920 /* iothread lock needed for ram_list.dirty_memory[] */
1921 qemu_mutex_lock_iothread();
1922 qemu_mutex_lock_ramlist();
1923 rcu_read_lock();
1924 bytes_transferred = 0;
1925 reset_ram_globals();
1926
1927 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
f3f491fc 1928 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
60be6340
DL
1929 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1930 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
56e93d26 1931
f3f491fc
DDAG
1932 if (migrate_postcopy_ram()) {
1933 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1934 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1935 }
1936
56e93d26
JQ
1937 /*
1938 * Count the total number of pages used by ram blocks not including any
1939 * gaps due to alignment or unplugs.
1940 */
1941 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1942
1943 memory_global_dirty_log_start();
1944 migration_bitmap_sync();
1945 qemu_mutex_unlock_ramlist();
1946 qemu_mutex_unlock_iothread();
1947
1948 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1949
1950 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1951 qemu_put_byte(f, strlen(block->idstr));
1952 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1953 qemu_put_be64(f, block->used_length);
1954 }
1955
1956 rcu_read_unlock();
1957
1958 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1959 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1960
1961 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1962
1963 return 0;
1964}
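/*
 * Editor's illustrative sketch (not part of migration/ram.c): the setup
 * section written above has the shape
 *
 *   be64: total ram bytes, with RAM_SAVE_FLAG_MEM_SIZE OR'd into it
 *   for each RAMBlock: u8 idstr length, idstr bytes, be64 used_length
 *   be64: RAM_SAVE_FLAG_EOS
 *
 * A hypothetical big-endian 64-bit writer matching that framing (the
 * sketch_* name and use of stdio are assumptions for the example):
 */
#include <stdint.h>
#include <stdio.h>

static void sketch_put_be64(FILE *f, uint64_t v)
{
    for (int shift = 56; shift >= 0; shift -= 8) {
        fputc((int)((v >> shift) & 0xff), f);   /* most significant byte first */
    }
}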
1965
1966static int ram_save_iterate(QEMUFile *f, void *opaque)
1967{
1968 int ret;
1969 int i;
1970 int64_t t0;
1971 int pages_sent = 0;
1972
1973 rcu_read_lock();
1974 if (ram_list.version != last_version) {
1975 reset_ram_globals();
1976 }
1977
1978 /* Read version before ram_list.blocks */
1979 smp_rmb();
1980
1981 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1982
1983 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1984 i = 0;
1985 while ((ret = qemu_file_rate_limit(f)) == 0) {
1986 int pages;
1987
1988 pages = ram_find_and_save_block(f, false, &bytes_transferred);
1989 /* no more pages to send */
1990 if (pages == 0) {
1991 break;
1992 }
1993 pages_sent += pages;
1994 acct_info.iterations++;
070afca2 1995
56e93d26
JQ
1996 /* we want to check in the 1st loop, just in case it was the 1st time
1997 and we had to sync the dirty bitmap.
1998 qemu_clock_get_ns() is a bit expensive, so we only check once
1999 every few iterations
2000 */
2001 if ((i & 63) == 0) {
2002 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2003 if (t1 > MAX_WAIT) {
2004 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2005 t1, i);
2006 break;
2007 }
2008 }
2009 i++;
2010 }
2011 flush_compressed_data(f);
2012 rcu_read_unlock();
2013
2014 /*
2015 * Must occur before EOS (or any QEMUFile operation)
2016 * because of RDMA protocol.
2017 */
2018 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2019
2020 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2021 bytes_transferred += 8;
2022
2023 ret = qemu_file_get_error(f);
2024 if (ret < 0) {
2025 return ret;
2026 }
2027
2028 return pages_sent;
2029}
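/*
 * Editor's illustrative sketch (not part of migration/ram.c): the iterate
 * loop above bounds its runtime by sampling the clock only every 64
 * iterations ((i & 63) == 0), since reading the clock on every pass would be
 * comparatively expensive.  A hypothetical standalone version of that
 * pattern, assuming a POSIX monotonic clock; all names are invented:
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static uint64_t sketch_now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000u + (uint64_t)(ts.tv_nsec / 1000000);
}

static void sketch_bounded_loop(uint64_t budget_ms, bool (*do_one_unit)(void))
{
    uint64_t t0 = sketch_now_ms();

    for (unsigned i = 0; do_one_unit(); i++) {
        if ((i & 63) == 0 && sketch_now_ms() - t0 > budget_ms) {
            break;      /* over budget: yield back to the caller */
        }
    }
}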
2030
2031/* Called with iothread lock */
2032static int ram_save_complete(QEMUFile *f, void *opaque)
2033{
2034 rcu_read_lock();
2035
663e6c1d
DDAG
2036 if (!migration_in_postcopy(migrate_get_current())) {
2037 migration_bitmap_sync();
2038 }
56e93d26
JQ
2039
2040 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2041
2042 /* try transferring iterative blocks of memory */
2043
2044 /* flush all remaining blocks regardless of rate limiting */
2045 while (true) {
2046 int pages;
2047
2048 pages = ram_find_and_save_block(f, true, &bytes_transferred);
2050 /* no more blocks to send */
2050 if (pages == 0) {
2051 break;
2052 }
2053 }
2054
2055 flush_compressed_data(f);
2056 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2057
2058 rcu_read_unlock();
d09a6fde 2059
56e93d26
JQ
2060 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2061
2062 return 0;
2063}
2064
c31b098f
DDAG
2065static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2066 uint64_t *non_postcopiable_pending,
2067 uint64_t *postcopiable_pending)
56e93d26
JQ
2068{
2069 uint64_t remaining_size;
2070
2071 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2072
663e6c1d
DDAG
2073 if (!migration_in_postcopy(migrate_get_current()) &&
2074 remaining_size < max_size) {
56e93d26
JQ
2075 qemu_mutex_lock_iothread();
2076 rcu_read_lock();
2077 migration_bitmap_sync();
2078 rcu_read_unlock();
2079 qemu_mutex_unlock_iothread();
2080 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2081 }
c31b098f
DDAG
2082
2083 /* We can do postcopy, and all the data is postcopiable */
2084 *postcopiable_pending += remaining_size;
56e93d26
JQ
2085}
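/*
 * Editor's illustrative sketch (not part of migration/ram.c): the pending
 * estimate above is just "remaining dirty pages * page size"; when that
 * first estimate already fits under the caller's max_size, the bitmap is
 * re-synced once and the estimate recomputed before completion is attempted.
 * Hypothetical helpers (all names invented):
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t sketch_pending_bytes(uint64_t dirty_pages, uint64_t page_size)
{
    return dirty_pages * page_size;     /* bytes still to transfer */
}

/* True when the estimated remainder fits within the allowed budget. */
static bool sketch_fits_budget(uint64_t dirty_pages, uint64_t page_size,
                               uint64_t max_bytes)
{
    return sketch_pending_bytes(dirty_pages, page_size) < max_bytes;
}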
2086
2087static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2088{
2089 unsigned int xh_len;
2090 int xh_flags;
2091
2092 if (!xbzrle_decoded_buf) {
2093 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2094 }
2095
2096 /* extract RLE header */
2097 xh_flags = qemu_get_byte(f);
2098 xh_len = qemu_get_be16(f);
2099
2100 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2101 error_report("Failed to load XBZRLE page - wrong compression!");
2102 return -1;
2103 }
2104
2105 if (xh_len > TARGET_PAGE_SIZE) {
2106 error_report("Failed to load XBZRLE page - len overflow!");
2107 return -1;
2108 }
2109 /* load data and decode */
2110 qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
2111
2112 /* decode RLE */
2113 if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
2114 TARGET_PAGE_SIZE) == -1) {
2115 error_report("Failed to load XBZRLE page - decode error!");
2116 return -1;
2117 }
2118
2119 return 0;
2120}
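/*
 * Editor's illustrative sketch (not part of migration/ram.c): an XBZRLE page
 * on the wire, as read above, is framed as one flag byte (expected to be
 * ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length that must not
 * exceed TARGET_PAGE_SIZE, and then the encoded bytes.  A hypothetical
 * header parser for that framing; the sketch_* names are invented:
 */
#include <stdint.h>
#include <stddef.h>

struct sketch_xbzrle_hdr {
    uint8_t  flags;     /* expected to equal the XBZRLE encoding flag */
    uint16_t enc_len;   /* length of the encoded payload that follows */
};

/* Returns 0 on success, -1 if fewer than 3 header bytes are available. */
static int sketch_parse_xbzrle_hdr(const uint8_t *buf, size_t len,
                                   struct sketch_xbzrle_hdr *hdr)
{
    if (len < 3) {
        return -1;
    }
    hdr->flags = buf[0];
    hdr->enc_len = (uint16_t)((buf[1] << 8) | buf[2]);  /* big-endian u16 */
    return 0;
}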
2121
2122/* Must be called from within an RCU critical section.
2123 * Returns a pointer from within the RCU-protected ram_list.
2124 */
a7180877
DDAG
2125/*
2126 * Read a RAMBlock ID from the stream f, find the host address of the
2127 * start of that block and add on 'offset'
2128 *
2129 * f: Stream to read from
2130 * offset: Offset within the block
2131 * flags: Page flags (mostly to see if it's a continuation of previous block)
2132 */
56e93d26
JQ
2133static inline void *host_from_stream_offset(QEMUFile *f,
2134 ram_addr_t offset,
2135 int flags)
2136{
2137 static RAMBlock *block = NULL;
2138 char id[256];
2139 uint8_t len;
2140
2141 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2142 if (!block || block->max_length <= offset) {
2143 error_report("Ack, bad migration stream!");
2144 return NULL;
2145 }
2146
2f68e399 2147 return block->host + offset;
56e93d26
JQ
2148 }
2149
2150 len = qemu_get_byte(f);
2151 qemu_get_buffer(f, (uint8_t *)id, len);
2152 id[len] = 0;
2153
e3dd7493
DDAG
2154 block = qemu_ram_block_by_name(id);
2155 if (block && block->max_length > offset) {
2156 return block->host + offset;
56e93d26
JQ
2157 }
2158
e3dd7493 2159 error_report("Can't find block %s", id);
56e93d26
JQ
2160 return NULL;
2161}
2162
2163/*
2164 * If a page (or a whole RDMA chunk) has been
2165 * determined to be zero, then zap it.
2166 */
2167void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2168{
2169 if (ch != 0 || !is_zero_range(host, size)) {
2170 memset(host, ch, size);
2171 }
2172}
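/*
 * Editor's illustrative sketch (not part of migration/ram.c): the zero-page
 * handling above skips the memset when the fill byte is zero and the
 * destination is already all zeroes, which is the common case for
 * never-touched guest RAM.  Hypothetical standalone equivalent (names
 * invented):
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool sketch_all_zero(const unsigned char *p, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

static void sketch_fill_page(unsigned char *host, unsigned char ch, size_t size)
{
    if (ch != 0 || !sketch_all_zero(host, size)) {
        memset(host, ch, size);     /* only touch the page when needed */
    }
}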
2173
2174static void *do_data_decompress(void *opaque)
2175{
2176 DecompressParam *param = opaque;
2177 unsigned long pagesize;
2178
2179 while (!quit_decomp_thread) {
2180 qemu_mutex_lock(&param->mutex);
2181 while (!param->start && !quit_decomp_thread) {
2182 qemu_cond_wait(&param->cond, &param->mutex);
2183 pagesize = TARGET_PAGE_SIZE;
2184 if (!quit_decomp_thread) {
2185 /* uncompress() can fail in some cases, especially
2186 * when the page was dirtied while it was being compressed; that is
2187 * not a problem because the dirty page will be retransferred
2188 * and uncompress() won't corrupt the data in other pages.
2189 */
2190 uncompress((Bytef *)param->des, &pagesize,
2191 (const Bytef *)param->compbuf, param->len);
2192 }
2193 param->start = false;
2194 }
2195 qemu_mutex_unlock(&param->mutex);
2196 }
2197
2198 return NULL;
2199}
2200
2201void migrate_decompress_threads_create(void)
2202{
2203 int i, thread_count;
2204
2205 thread_count = migrate_decompress_threads();
2206 decompress_threads = g_new0(QemuThread, thread_count);
2207 decomp_param = g_new0(DecompressParam, thread_count);
2208 compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2209 quit_decomp_thread = false;
2210 for (i = 0; i < thread_count; i++) {
2211 qemu_mutex_init(&decomp_param[i].mutex);
2212 qemu_cond_init(&decomp_param[i].cond);
2213 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2214 qemu_thread_create(decompress_threads + i, "decompress",
2215 do_data_decompress, decomp_param + i,
2216 QEMU_THREAD_JOINABLE);
2217 }
2218}
2219
2220void migrate_decompress_threads_join(void)
2221{
2222 int i, thread_count;
2223
2224 quit_decomp_thread = true;
2225 thread_count = migrate_decompress_threads();
2226 for (i = 0; i < thread_count; i++) {
2227 qemu_mutex_lock(&decomp_param[i].mutex);
2228 qemu_cond_signal(&decomp_param[i].cond);
2229 qemu_mutex_unlock(&decomp_param[i].mutex);
2230 }
2231 for (i = 0; i < thread_count; i++) {
2232 qemu_thread_join(decompress_threads + i);
2233 qemu_mutex_destroy(&decomp_param[i].mutex);
2234 qemu_cond_destroy(&decomp_param[i].cond);
2235 g_free(decomp_param[i].compbuf);
2236 }
2237 g_free(decompress_threads);
2238 g_free(decomp_param);
2239 g_free(compressed_data_buf);
2240 decompress_threads = NULL;
2241 decomp_param = NULL;
2242 compressed_data_buf = NULL;
2243}
2244
2245static void decompress_data_with_multi_threads(uint8_t *compbuf,
2246 void *host, int len)
2247{
2248 int idx, thread_count;
2249
2250 thread_count = migrate_decompress_threads();
2251 while (true) {
2252 for (idx = 0; idx < thread_count; idx++) {
2253 if (!decomp_param[idx].start) {
2254 memcpy(decomp_param[idx].compbuf, compbuf, len);
2255 decomp_param[idx].des = host;
2256 decomp_param[idx].len = len;
2257 start_decompression(&decomp_param[idx]);
2258 break;
2259 }
2260 }
2261 if (idx < thread_count) {
2262 break;
2263 }
2264 }
2265}
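/*
 * Editor's illustrative sketch (not part of migration/ram.c): the dispatcher
 * above scans the worker slots until it finds one that is idle (!start),
 * hands it the compressed buffer, and restarts the whole scan if every
 * worker is busy.  The sketch below shows only that idle-slot scan shape;
 * the real code hands work over under a per-worker mutex/condvar rather
 * than a volatile flag, and every sketch_* name is invented.
 */
#include <stdbool.h>

struct sketch_worker {
    volatile bool busy;     /* set by the dispatcher, cleared by the worker */
};

/* Returns the index of a claimed idle worker, or -1 if all are busy. */
static int sketch_claim_idle_worker(struct sketch_worker *w, int nr)
{
    for (int idx = 0; idx < nr; idx++) {
        if (!w[idx].busy) {
            w[idx].busy = true;
            return idx;
        }
    }
    return -1;  /* caller retries, mirroring the while (true) loop above */
}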
2266
1caddf8a
DDAG
2267/*
2268 * Allocate the data structures etc. needed by incoming migration with postcopy-ram;
2269 * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2270 */
2271int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2272{
2273 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2274
2275 return postcopy_ram_incoming_init(mis, ram_pages);
2276}
2277
a7180877
DDAG
2278/*
2279 * Called in postcopy mode by ram_load().
2280 * rcu_read_lock is taken prior to this being called.
2281 */
2282static int ram_load_postcopy(QEMUFile *f)
2283{
2284 int flags = 0, ret = 0;
2285 bool place_needed = false;
2286 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2287 MigrationIncomingState *mis = migration_incoming_get_current();
2288 /* Temporary page that is later 'placed' */
2289 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2290 void *last_host = NULL;
a3b6ff6d 2291 bool all_zero = false;
a7180877
DDAG
2292
2293 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2294 ram_addr_t addr;
2295 void *host = NULL;
2296 void *page_buffer = NULL;
2297 void *place_source = NULL;
2298 uint8_t ch;
a7180877
DDAG
2299
2300 addr = qemu_get_be64(f);
2301 flags = addr & ~TARGET_PAGE_MASK;
2302 addr &= TARGET_PAGE_MASK;
2303
2304 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2305 place_needed = false;
2306 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2307 host = host_from_stream_offset(f, addr, flags);
2308 if (!host) {
2309 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2310 ret = -EINVAL;
2311 break;
2312 }
2313 page_buffer = host;
2314 /*
2315 * Postcopy requires that we place whole host pages atomically.
2316 * To make it atomic, the data is read into a temporary page
2317 * that's moved into place later.
2318 * The migration protocol uses (possibly smaller) target pages;
2319 * however, the source ensures it always sends all the components
2320 * of a host page in order.
2321 */
2322 page_buffer = postcopy_host_page +
2323 ((uintptr_t)host & ~qemu_host_page_mask);
2324 /* If all TP are zero then we can optimise the place */
2325 if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2326 all_zero = true;
c53b7ddc
DDAG
2327 } else {
2328 /* not the 1st TP within the HP */
2329 if (host != (last_host + TARGET_PAGE_SIZE)) {
2330 error_report("Non-sequential target page %p/%p",
2331 host, last_host);
2332 ret = -EINVAL;
2333 break;
2334 }
a7180877
DDAG
2335 }
2336
c53b7ddc 2337
a7180877
DDAG
2338 /*
2339 * If it's the last part of a host page then we place the host
2340 * page
2341 */
2342 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2343 ~qemu_host_page_mask) == 0;
2344 place_source = postcopy_host_page;
2345 }
c53b7ddc 2346 last_host = host;
a7180877
DDAG
2347
2348 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2349 case RAM_SAVE_FLAG_COMPRESS:
2350 ch = qemu_get_byte(f);
2351 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2352 if (ch) {
2353 all_zero = false;
2354 }
2355 break;
2356
2357 case RAM_SAVE_FLAG_PAGE:
2358 all_zero = false;
2359 if (!place_needed || !matching_page_sizes) {
2360 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2361 } else {
2362 /* Avoids the qemu_file copy during postcopy, which is
2363 * going to do a copy later; can only do it when we
2364 * do this read in one go (matching page sizes)
2365 */
2366 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2367 TARGET_PAGE_SIZE);
2368 }
2369 break;
2370 case RAM_SAVE_FLAG_EOS:
2371 /* normal exit */
2372 break;
2373 default:
2374 error_report("Unknown combination of migration flags: %#x"
2375 " (postcopy mode)", flags);
2376 ret = -EINVAL;
2377 }
2378
2379 if (place_needed) {
2380 /* This gets called at the last target page in the host page */
2381 if (all_zero) {
2382 ret = postcopy_place_page_zero(mis,
2383 host + TARGET_PAGE_SIZE -
2384 qemu_host_page_size);
2385 } else {
2386 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2387 qemu_host_page_size,
2388 place_source);
2389 }
2390 }
2391 if (!ret) {
2392 ret = qemu_file_get_error(f);
2393 }
2394 }
2395
2396 return ret;
2397}
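/*
 * Editor's illustrative sketch (not part of migration/ram.c): the postcopy
 * loader above assembles TARGET_PAGE_SIZE chunks into a temporary buffer and
 * only "places" the buffer once the last target page of a host page has
 * arrived.  The two mask tests it uses reduce to the arithmetic below,
 * assuming power-of-two page sizes; the sketch_* names are invented:
 */
#include <stdbool.h>
#include <stdint.h>

/* Offset of a target page inside its host page. */
static uintptr_t sketch_offset_in_host_page(uintptr_t host_addr,
                                            uintptr_t host_page_size)
{
    return host_addr & (host_page_size - 1);
}

/* True when this target page is the last one of its host page. */
static bool sketch_is_last_target_page(uintptr_t host_addr,
                                       uintptr_t target_page_size,
                                       uintptr_t host_page_size)
{
    return ((host_addr + target_page_size) & (host_page_size - 1)) == 0;
}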
2398
56e93d26
JQ
2399static int ram_load(QEMUFile *f, void *opaque, int version_id)
2400{
2401 int flags = 0, ret = 0;
2402 static uint64_t seq_iter;
2403 int len = 0;
a7180877
DDAG
2404 /*
2405 * If system is running in postcopy mode, page inserts to host memory must
2406 * be atomic
2407 */
2408 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
56e93d26
JQ
2409
2410 seq_iter++;
2411
2412 if (version_id != 4) {
2413 ret = -EINVAL;
2414 }
2415
2416 /* This RCU critical section can be very long running.
2417 * When RCU reclaims in the code start to become numerous,
2418 * it will be necessary to reduce the granularity of this
2419 * critical section.
2420 */
2421 rcu_read_lock();
a7180877
DDAG
2422
2423 if (postcopy_running) {
2424 ret = ram_load_postcopy(f);
2425 }
2426
2427 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2428 ram_addr_t addr, total_ram_bytes;
a776aa15 2429 void *host = NULL;
56e93d26
JQ
2430 uint8_t ch;
2431
2432 addr = qemu_get_be64(f);
2433 flags = addr & ~TARGET_PAGE_MASK;
2434 addr &= TARGET_PAGE_MASK;
2435
a776aa15
DDAG
2436 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2437 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2438 host = host_from_stream_offset(f, addr, flags);
2439 if (!host) {
2440 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2441 ret = -EINVAL;
2442 break;
2443 }
2444 }
2445
56e93d26
JQ
2446 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2447 case RAM_SAVE_FLAG_MEM_SIZE:
2448 /* Synchronize RAM block list */
2449 total_ram_bytes = addr;
2450 while (!ret && total_ram_bytes) {
2451 RAMBlock *block;
56e93d26
JQ
2452 char id[256];
2453 ram_addr_t length;
2454
2455 len = qemu_get_byte(f);
2456 qemu_get_buffer(f, (uint8_t *)id, len);
2457 id[len] = 0;
2458 length = qemu_get_be64(f);
2459
e3dd7493
DDAG
2460 block = qemu_ram_block_by_name(id);
2461 if (block) {
2462 if (length != block->used_length) {
2463 Error *local_err = NULL;
56e93d26 2464
e3dd7493
DDAG
2465 ret = qemu_ram_resize(block->offset, length,
2466 &local_err);
2467 if (local_err) {
2468 error_report_err(local_err);
56e93d26 2469 }
56e93d26 2470 }
e3dd7493
DDAG
2471 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2472 block->idstr);
2473 } else {
56e93d26
JQ
2474 error_report("Unknown ramblock \"%s\", cannot "
2475 "accept migration", id);
2476 ret = -EINVAL;
2477 }
2478
2479 total_ram_bytes -= length;
2480 }
2481 break;
a776aa15 2482
56e93d26 2483 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2484 ch = qemu_get_byte(f);
2485 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2486 break;
a776aa15 2487
56e93d26 2488 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2489 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2490 break;
56e93d26 2491
a776aa15 2492 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2493 len = qemu_get_be32(f);
2494 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2495 error_report("Invalid compressed data length: %d", len);
2496 ret = -EINVAL;
2497 break;
2498 }
2499 qemu_get_buffer(f, compressed_data_buf, len);
2500 decompress_data_with_multi_threads(compressed_data_buf, host, len);
2501 break;
a776aa15 2502
56e93d26 2503 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2504 if (load_xbzrle(f, addr, host) < 0) {
2505 error_report("Failed to decompress XBZRLE page at "
2506 RAM_ADDR_FMT, addr);
2507 ret = -EINVAL;
2508 break;
2509 }
2510 break;
2511 case RAM_SAVE_FLAG_EOS:
2512 /* normal exit */
2513 break;
2514 default:
2515 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2516 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2517 } else {
2518 error_report("Unknown combination of migration flags: %#x",
2519 flags);
2520 ret = -EINVAL;
2521 }
2522 }
2523 if (!ret) {
2524 ret = qemu_file_get_error(f);
2525 }
2526 }
2527
2528 rcu_read_unlock();
2529 DPRINTF("Completed load of VM with exit code %d seq iteration "
2530 "%" PRIu64 "\n", ret, seq_iter);
2531 return ret;
2532}
2533
2534static SaveVMHandlers savevm_ram_handlers = {
2535 .save_live_setup = ram_save_setup,
2536 .save_live_iterate = ram_save_iterate,
763c906b 2537 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2538 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2539 .save_live_pending = ram_save_pending,
2540 .load_state = ram_load,
6ad2a215 2541 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2542};
2543
2544void ram_mig_init(void)
2545{
2546 qemu_mutex_init(&XBZRLE.lock);
2547 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2548}
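/*
 * Editor's illustrative sketch (not part of migration/ram.c): the
 * SaveVMHandlers table registered above is a struct of function pointers
 * that the generic migration code invokes at the setup, iterate and
 * complete stages.  The miniature driver below only illustrates that
 * dispatch style; it is not QEMU's actual savevm loop, and all names and
 * return-value conventions here are invented for the example.
 */
struct sketch_handlers {
    int (*setup)(void *opaque);
    int (*iterate)(void *opaque);   /* > 0: more work queued, 0: done, < 0: error */
    int (*complete)(void *opaque);
};

static int sketch_run_save(const struct sketch_handlers *h, void *opaque)
{
    int ret = h->setup(opaque);

    while (ret > 0) {
        ret = h->iterate(opaque);
    }
    return ret < 0 ? ret : h->complete(opaque);
}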