]> git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
migration: Cleanup xbzrle zero page cache update logic
[mirror_qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
53d37d36 43#include "page_cache.h"
56e93d26 44#include "qemu/error-report.h"
e688df6b 45#include "qapi/error.h"
ab7cbb0b 46#include "qapi/qapi-types-migration.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
b0c3cf94 55#include "sysemu/cpu-throttle.h"
edd090c7 56#include "savevm.h"
b9ee2f7d 57#include "qemu/iov.h"
d32ca5ad 58#include "multifd.h"
278e2f55
AG
59#include "sysemu/runstate.h"
60
e5fdf920
LS
61#include "hw/boards.h" /* for machine_dump_guest_core() */
62
278e2f55
AG
63#if defined(__linux__)
64#include "qemu/userfaultfd.h"
65#endif /* defined(__linux__) */
56e93d26 66
56e93d26
JQ
67/***********************************************************/
68/* ram save/restore */
69
bb890ed5
JQ
70/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
71 * worked for pages that where filled with the same char. We switched
72 * it to only search for the zero value. And to avoid confusion with
73 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
74 */
75
56e93d26 76#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 77#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
78#define RAM_SAVE_FLAG_MEM_SIZE 0x04
79#define RAM_SAVE_FLAG_PAGE 0x08
80#define RAM_SAVE_FLAG_EOS 0x10
81#define RAM_SAVE_FLAG_CONTINUE 0x20
82#define RAM_SAVE_FLAG_XBZRLE 0x40
83/* 0x80 is reserved in migration.h start with 0x100 next */
84#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
85
9360447d
JQ
86XBZRLECacheStats xbzrle_counters;
87
56e93d26
JQ
88/* struct contains XBZRLE cache and a static page
89 used by the compression */
90static struct {
91 /* buffer used for XBZRLE encoding */
92 uint8_t *encoded_buf;
93 /* buffer for storing page content */
94 uint8_t *current_buf;
95 /* Cache for XBZRLE, Protected by lock. */
96 PageCache *cache;
97 QemuMutex lock;
c00e0928
JQ
98 /* it will store a page full of zeros */
99 uint8_t *zero_target_page;
f265e0e4
JQ
100 /* buffer used for XBZRLE decoding */
101 uint8_t *decoded_buf;
56e93d26
JQ
102} XBZRLE;
103
56e93d26
JQ
104static void XBZRLE_cache_lock(void)
105{
f4c51a6b 106 if (migrate_use_xbzrle()) {
56e93d26 107 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 108 }
56e93d26
JQ
109}
110
111static void XBZRLE_cache_unlock(void)
112{
f4c51a6b 113 if (migrate_use_xbzrle()) {
56e93d26 114 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 115 }
56e93d26
JQ
116}
117
3d0684b2
JQ
118/**
119 * xbzrle_cache_resize: resize the xbzrle cache
120 *
cbde7be9 121 * This function is called from migrate_params_apply in main
3d0684b2
JQ
122 * thread, possibly while a migration is in progress. A running
123 * migration may be using the cache and might finish during this call,
124 * hence changes to the cache are protected by XBZRLE.lock().
125 *
c9dede2d 126 * Returns 0 for success or -1 for error
3d0684b2
JQ
127 *
128 * @new_size: new cache size
8acabf69 129 * @errp: set *errp if the check failed, with reason
56e93d26 130 */
8b9407a0 131int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
132{
133 PageCache *new_cache;
c9dede2d 134 int64_t ret = 0;
56e93d26 135
8acabf69
JQ
136 /* Check for truncation */
137 if (new_size != (size_t)new_size) {
138 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139 "exceeding address space");
140 return -1;
141 }
142
2a313e5c
JQ
143 if (new_size == migrate_xbzrle_cache_size()) {
144 /* nothing to do */
c9dede2d 145 return 0;
2a313e5c
JQ
146 }
147
56e93d26
JQ
148 XBZRLE_cache_lock();
149
150 if (XBZRLE.cache != NULL) {
80f8dfde 151 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 152 if (!new_cache) {
56e93d26
JQ
153 ret = -1;
154 goto out;
155 }
156
157 cache_fini(XBZRLE.cache);
158 XBZRLE.cache = new_cache;
159 }
56e93d26
JQ
160out:
161 XBZRLE_cache_unlock();
162 return ret;
163}
164
20123ee1
PX
165static bool postcopy_preempt_active(void)
166{
167 return migrate_postcopy_preempt() && migration_in_postcopy();
168}
169
3ded54b1 170bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
171{
172 return !qemu_ram_is_migratable(block) ||
173 (migrate_ignore_shared() && qemu_ram_is_shared(block));
174}
175
343f632c
DDAG
176#undef RAMBLOCK_FOREACH
177
fbd162e6
YK
178int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
179{
180 RAMBlock *block;
181 int ret = 0;
182
89ac5a1d
DDAG
183 RCU_READ_LOCK_GUARD();
184
fbd162e6
YK
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
fbd162e6
YK
191 return ret;
192}
193
f9494614
AP
194static void ramblock_recv_map_init(void)
195{
196 RAMBlock *rb;
197
fbd162e6 198 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
199 assert(!rb->receivedmap);
200 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
201 }
202}
203
204int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
205{
206 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
207 rb->receivedmap);
208}
209
1cba9f6e
DDAG
210bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
211{
212 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
213}
214
f9494614
AP
215void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
216{
217 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
218}
219
220void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
221 size_t nr)
222{
223 bitmap_set_atomic(rb->receivedmap,
224 ramblock_recv_bitmap_offset(host_addr, rb),
225 nr);
226}
227
a335debb
PX
228#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229
230/*
231 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
232 *
233 * Returns >0 if success with sent bytes, or <0 if error.
234 */
235int64_t ramblock_recv_bitmap_send(QEMUFile *file,
236 const char *block_name)
237{
238 RAMBlock *block = qemu_ram_block_by_name(block_name);
239 unsigned long *le_bitmap, nbits;
240 uint64_t size;
241
242 if (!block) {
243 error_report("%s: invalid block name: %s", __func__, block_name);
244 return -1;
245 }
246
898ba906 247 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
248
249 /*
250 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
251 * machines we may need 4 more bytes for padding (see below
252 * comment). So extend it a bit before hand.
253 */
254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
255
256 /*
257 * Always use little endian when sending the bitmap. This is
258 * required that when source and destination VMs are not using the
3a4452d8 259 * same endianness. (Note: big endian won't work.)
a335debb
PX
260 */
261 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
262
263 /* Size of the bitmap, in bytes */
a725ef9f 264 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
265
266 /*
267 * size is always aligned to 8 bytes for 64bit machines, but it
268 * may not be true for 32bit machines. We need this padding to
269 * make sure the migration can survive even between 32bit and
270 * 64bit machines.
271 */
272 size = ROUND_UP(size, 8);
273
274 qemu_put_be64(file, size);
275 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
276 /*
277 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 278 * some "mysterious" reason.
a335debb
PX
279 */
280 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
281 qemu_fflush(file);
282
bf269906 283 g_free(le_bitmap);
a335debb
PX
284
285 if (qemu_file_get_error(file)) {
286 return qemu_file_get_error(file);
287 }
288
289 return size + sizeof(size);
290}
291
ec481c6c
JQ
292/*
293 * An outstanding page request, on the source, having been received
294 * and queued
295 */
296struct RAMSrcPageRequest {
297 RAMBlock *rb;
298 hwaddr offset;
299 hwaddr len;
300
301 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
302};
303
c01b16ed
PX
304typedef struct {
305 /*
306 * Cached ramblock/offset values if preempted. They're only meaningful if
307 * preempted==true below.
308 */
309 RAMBlock *ram_block;
310 unsigned long ram_page;
311 /*
312 * Whether a postcopy preemption just happened. Will be reset after
313 * precopy recovered to background migration.
314 */
315 bool preempted;
316} PostcopyPreemptState;
317
6f37bb8b
JQ
318/* State of RAM for migration */
319struct RAMState {
204b88b8
JQ
320 /* QEMUFile used for this migration */
321 QEMUFile *f;
278e2f55
AG
322 /* UFFD file descriptor, used in 'write-tracking' migration */
323 int uffdio_fd;
6f37bb8b
JQ
324 /* Last block that we have visited searching for dirty pages */
325 RAMBlock *last_seen_block;
326 /* Last block from where we have sent data */
327 RAMBlock *last_sent_block;
269ace29
JQ
328 /* Last dirty target page we have sent */
329 ram_addr_t last_page;
6f37bb8b
JQ
330 /* last ram version we have seen */
331 uint32_t last_version;
8d820d6f
JQ
332 /* How many times we have dirty too many pages */
333 int dirty_rate_high_cnt;
f664da80
JQ
334 /* these variables are used for bitmap sync */
335 /* last time we did a full bitmap_sync */
336 int64_t time_last_bitmap_sync;
eac74159 337 /* bytes transferred at start_time */
c4bdf0cf 338 uint64_t bytes_xfer_prev;
a66cd90c 339 /* number of dirty pages since start_time */
68908ed6 340 uint64_t num_dirty_pages_period;
b5833fde
JQ
341 /* xbzrle misses since the beginning of the period */
342 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
343 /* Amount of xbzrle pages since the beginning of the period */
344 uint64_t xbzrle_pages_prev;
345 /* Amount of xbzrle encoded bytes since the beginning of the period */
346 uint64_t xbzrle_bytes_prev;
1a373522
DH
347 /* Start using XBZRLE (e.g., after the first round). */
348 bool xbzrle_enabled;
05931ec5
JQ
349 /* Are we on the last stage of migration */
350 bool last_stage;
76e03000
XG
351 /* compression statistics since the beginning of the period */
352 /* amount of count that no free thread to compress data */
353 uint64_t compress_thread_busy_prev;
354 /* amount bytes after compression */
355 uint64_t compressed_size_prev;
356 /* amount of compressed pages */
357 uint64_t compress_pages_prev;
358
be8b02ed
XG
359 /* total handled target pages at the beginning of period */
360 uint64_t target_page_count_prev;
361 /* total handled target pages since start */
362 uint64_t target_page_count;
9360447d 363 /* number of dirty bits in the bitmap */
2dfaf12e 364 uint64_t migration_dirty_pages;
386a907b 365 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 366 QemuMutex bitmap_mutex;
68a098f3
JQ
367 /* The RAMBlock used in the last src_page_requests */
368 RAMBlock *last_req_rb;
ec481c6c
JQ
369 /* Queue of outstanding page requests from the destination */
370 QemuMutex src_page_req_mutex;
b58deb34 371 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
c01b16ed
PX
372
373 /* Postcopy preemption informations */
374 PostcopyPreemptState postcopy_preempt_state;
375 /*
376 * Current channel we're using on src VM. Only valid if postcopy-preempt
377 * is enabled.
378 */
379 unsigned int postcopy_channel;
6f37bb8b
JQ
380};
381typedef struct RAMState RAMState;
382
53518d94 383static RAMState *ram_state;
6f37bb8b 384
bd227060
WW
385static NotifierWithReturnList precopy_notifier_list;
386
c01b16ed
PX
387static void postcopy_preempt_reset(RAMState *rs)
388{
389 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
390}
391
a1fe28df
PX
392/* Whether postcopy has queued requests? */
393static bool postcopy_has_request(RAMState *rs)
394{
395 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
396}
397
bd227060
WW
398void precopy_infrastructure_init(void)
399{
400 notifier_with_return_list_init(&precopy_notifier_list);
401}
402
403void precopy_add_notifier(NotifierWithReturn *n)
404{
405 notifier_with_return_list_add(&precopy_notifier_list, n);
406}
407
408void precopy_remove_notifier(NotifierWithReturn *n)
409{
410 notifier_with_return_remove(n);
411}
412
413int precopy_notify(PrecopyNotifyReason reason, Error **errp)
414{
415 PrecopyNotifyData pnd;
416 pnd.reason = reason;
417 pnd.errp = errp;
418
419 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
420}
421
9edabd4d 422uint64_t ram_bytes_remaining(void)
2f4fde93 423{
bae416e5
DDAG
424 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
425 0;
2f4fde93
JQ
426}
427
9360447d 428MigrationStats ram_counters;
96506894 429
26a26069 430void ram_transferred_add(uint64_t bytes)
4c2d0f6d 431{
ae680668
DE
432 if (runstate_is_running()) {
433 ram_counters.precopy_bytes += bytes;
434 } else if (migration_in_postcopy()) {
435 ram_counters.postcopy_bytes += bytes;
436 } else {
437 ram_counters.downtime_bytes += bytes;
438 }
4c2d0f6d
DE
439 ram_counters.transferred += bytes;
440}
441
d59c40cc
LB
442void dirty_sync_missed_zero_copy(void)
443{
444 ram_counters.dirty_sync_missed_zero_copy++;
445}
446
b8fb8cb7
DDAG
447/* used by the search for pages to send */
448struct PageSearchStatus {
449 /* Current block being searched */
450 RAMBlock *block;
a935e30f
JQ
451 /* Current page to search from */
452 unsigned long page;
b8fb8cb7
DDAG
453 /* Set once we wrap around */
454 bool complete_round;
82b54ef4
PX
455 /*
456 * [POSTCOPY-ONLY] Whether current page is explicitly requested by
457 * postcopy. When set, the request is "urgent" because the dest QEMU
458 * threads are waiting for us.
459 */
ea2faf0c 460 bool postcopy_requested;
82b54ef4
PX
461 /*
462 * [POSTCOPY-ONLY] The target channel to use to send current page.
463 *
464 * Note: This may _not_ match with the value in postcopy_requested
465 * above. Let's imagine the case where the postcopy request is exactly
466 * the page that we're sending in progress during precopy. In this case
467 * we'll have postcopy_requested set to true but the target channel
468 * will be the precopy channel (so that we don't split brain on that
469 * specific page since the precopy channel already contains partial of
470 * that page data).
471 *
472 * Besides that specific use case, postcopy_target_channel should
473 * always be equal to postcopy_requested, because by default we send
474 * postcopy pages via postcopy preempt channel.
475 */
476 bool postcopy_target_channel;
b8fb8cb7
DDAG
477};
478typedef struct PageSearchStatus PageSearchStatus;
479
76e03000
XG
480CompressionStats compression_counters;
481
56e93d26 482struct CompressParam {
56e93d26 483 bool done;
90e56fb4 484 bool quit;
5e5fdcff 485 bool zero_page;
56e93d26
JQ
486 QEMUFile *file;
487 QemuMutex mutex;
488 QemuCond cond;
489 RAMBlock *block;
490 ram_addr_t offset;
34ab9e97
XG
491
492 /* internally used fields */
dcaf446e 493 z_stream stream;
34ab9e97 494 uint8_t *originbuf;
56e93d26
JQ
495};
496typedef struct CompressParam CompressParam;
497
498struct DecompressParam {
73a8912b 499 bool done;
90e56fb4 500 bool quit;
56e93d26
JQ
501 QemuMutex mutex;
502 QemuCond cond;
503 void *des;
d341d9f3 504 uint8_t *compbuf;
56e93d26 505 int len;
797ca154 506 z_stream stream;
56e93d26
JQ
507};
508typedef struct DecompressParam DecompressParam;
509
510static CompressParam *comp_param;
511static QemuThread *compress_threads;
512/* comp_done_cond is used to wake up the migration thread when
513 * one of the compression threads has finished the compression.
514 * comp_done_lock is used to co-work with comp_done_cond.
515 */
0d9f9a5c
LL
516static QemuMutex comp_done_lock;
517static QemuCond comp_done_cond;
56e93d26 518
34ab9e97 519static QEMUFile *decomp_file;
56e93d26
JQ
520static DecompressParam *decomp_param;
521static QemuThread *decompress_threads;
73a8912b
LL
522static QemuMutex decomp_done_lock;
523static QemuCond decomp_done_cond;
56e93d26 524
5e5fdcff 525static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 526 ram_addr_t offset, uint8_t *source_buf);
56e93d26 527
82b54ef4
PX
528static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
529 bool postcopy_requested);
530
56e93d26
JQ
531static void *do_data_compress(void *opaque)
532{
533 CompressParam *param = opaque;
a7a9a88f
LL
534 RAMBlock *block;
535 ram_addr_t offset;
5e5fdcff 536 bool zero_page;
56e93d26 537
a7a9a88f 538 qemu_mutex_lock(&param->mutex);
90e56fb4 539 while (!param->quit) {
a7a9a88f
LL
540 if (param->block) {
541 block = param->block;
542 offset = param->offset;
543 param->block = NULL;
544 qemu_mutex_unlock(&param->mutex);
545
5e5fdcff
XG
546 zero_page = do_compress_ram_page(param->file, &param->stream,
547 block, offset, param->originbuf);
a7a9a88f 548
0d9f9a5c 549 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 550 param->done = true;
5e5fdcff 551 param->zero_page = zero_page;
0d9f9a5c
LL
552 qemu_cond_signal(&comp_done_cond);
553 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
554
555 qemu_mutex_lock(&param->mutex);
556 } else {
56e93d26
JQ
557 qemu_cond_wait(&param->cond, &param->mutex);
558 }
56e93d26 559 }
a7a9a88f 560 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
561
562 return NULL;
563}
564
f0afa331 565static void compress_threads_save_cleanup(void)
56e93d26
JQ
566{
567 int i, thread_count;
568
05306935 569 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
570 return;
571 }
05306935 572
56e93d26
JQ
573 thread_count = migrate_compress_threads();
574 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
575 /*
576 * we use it as a indicator which shows if the thread is
577 * properly init'd or not
578 */
579 if (!comp_param[i].file) {
580 break;
581 }
05306935
FL
582
583 qemu_mutex_lock(&comp_param[i].mutex);
584 comp_param[i].quit = true;
585 qemu_cond_signal(&comp_param[i].cond);
586 qemu_mutex_unlock(&comp_param[i].mutex);
587
56e93d26 588 qemu_thread_join(compress_threads + i);
56e93d26
JQ
589 qemu_mutex_destroy(&comp_param[i].mutex);
590 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 591 deflateEnd(&comp_param[i].stream);
34ab9e97 592 g_free(comp_param[i].originbuf);
dcaf446e
XG
593 qemu_fclose(comp_param[i].file);
594 comp_param[i].file = NULL;
56e93d26 595 }
0d9f9a5c
LL
596 qemu_mutex_destroy(&comp_done_lock);
597 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
598 g_free(compress_threads);
599 g_free(comp_param);
56e93d26
JQ
600 compress_threads = NULL;
601 comp_param = NULL;
56e93d26
JQ
602}
603
dcaf446e 604static int compress_threads_save_setup(void)
56e93d26
JQ
605{
606 int i, thread_count;
607
608 if (!migrate_use_compression()) {
dcaf446e 609 return 0;
56e93d26 610 }
56e93d26
JQ
611 thread_count = migrate_compress_threads();
612 compress_threads = g_new0(QemuThread, thread_count);
613 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
614 qemu_cond_init(&comp_done_cond);
615 qemu_mutex_init(&comp_done_lock);
56e93d26 616 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
617 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
618 if (!comp_param[i].originbuf) {
619 goto exit;
620 }
621
dcaf446e
XG
622 if (deflateInit(&comp_param[i].stream,
623 migrate_compress_level()) != Z_OK) {
34ab9e97 624 g_free(comp_param[i].originbuf);
dcaf446e
XG
625 goto exit;
626 }
627
e110aa91
C
628 /* comp_param[i].file is just used as a dummy buffer to save data,
629 * set its ops to empty.
56e93d26 630 */
77ef2dc1 631 comp_param[i].file = qemu_file_new_output(
c0e0825c 632 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 633 comp_param[i].done = true;
90e56fb4 634 comp_param[i].quit = false;
56e93d26
JQ
635 qemu_mutex_init(&comp_param[i].mutex);
636 qemu_cond_init(&comp_param[i].cond);
637 qemu_thread_create(compress_threads + i, "compress",
638 do_data_compress, comp_param + i,
639 QEMU_THREAD_JOINABLE);
640 }
dcaf446e
XG
641 return 0;
642
643exit:
644 compress_threads_save_cleanup();
645 return -1;
56e93d26
JQ
646}
647
648/**
3d0684b2 649 * save_page_header: write page header to wire
56e93d26
JQ
650 *
651 * If this is the 1st block, it also writes the block identification
652 *
3d0684b2 653 * Returns the number of bytes written
56e93d26
JQ
654 *
655 * @f: QEMUFile where to send the data
656 * @block: block that contains the page we want to send
657 * @offset: offset inside the block for the page
658 * in the lower bits, it contains flags
659 */
2bf3aa85
JQ
660static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
661 ram_addr_t offset)
56e93d26 662{
9f5f380b 663 size_t size, len;
56e93d26 664
24795694
JQ
665 if (block == rs->last_sent_block) {
666 offset |= RAM_SAVE_FLAG_CONTINUE;
667 }
2bf3aa85 668 qemu_put_be64(f, offset);
56e93d26
JQ
669 size = 8;
670
671 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 672 len = strlen(block->idstr);
2bf3aa85
JQ
673 qemu_put_byte(f, len);
674 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 675 size += 1 + len;
24795694 676 rs->last_sent_block = block;
56e93d26
JQ
677 }
678 return size;
679}
680
3d0684b2 681/**
179a8080 682 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
683 *
684 * Reduce amount of guest cpu execution to hopefully slow down memory
685 * writes. If guest dirty memory rate is reduced below the rate at
686 * which we can transfer pages to the destination then we should be
687 * able to complete migration. Some workloads dirty memory way too
688 * fast and will not effectively converge, even with auto-converge.
070afca2 689 */
cbbf8182
KZ
690static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
691 uint64_t bytes_dirty_threshold)
070afca2
JH
692{
693 MigrationState *s = migrate_get_current();
2594f56d 694 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
695 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
696 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 697 int pct_max = s->parameters.max_cpu_throttle;
070afca2 698
cbbf8182
KZ
699 uint64_t throttle_now = cpu_throttle_get_percentage();
700 uint64_t cpu_now, cpu_ideal, throttle_inc;
701
070afca2
JH
702 /* We have not started throttling yet. Let's start it. */
703 if (!cpu_throttle_active()) {
704 cpu_throttle_set(pct_initial);
705 } else {
706 /* Throttling already on, just increase the rate */
cbbf8182
KZ
707 if (!pct_tailslow) {
708 throttle_inc = pct_increment;
709 } else {
710 /* Compute the ideal CPU percentage used by Guest, which may
711 * make the dirty rate match the dirty rate threshold. */
712 cpu_now = 100 - throttle_now;
713 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
714 bytes_dirty_period);
715 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
716 }
717 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
718 }
719}
720
91fe9a8d
RL
721void mig_throttle_counter_reset(void)
722{
723 RAMState *rs = ram_state;
724
725 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
726 rs->num_dirty_pages_period = 0;
727 rs->bytes_xfer_prev = ram_counters.transferred;
728}
729
3d0684b2
JQ
730/**
731 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
732 *
6f37bb8b 733 * @rs: current RAM state
3d0684b2
JQ
734 * @current_addr: address for the zero page
735 *
736 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
737 * The important thing is that a stale (not-yet-0'd) page be replaced
738 * by the new data.
739 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 740 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 741 */
6f37bb8b 742static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 743{
56e93d26
JQ
744 /* We don't care if this fails to allocate a new cache page
745 * as long as it updated an old one */
c00e0928 746 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 747 ram_counters.dirty_sync_count);
56e93d26
JQ
748}
749
750#define ENCODING_FLAG_XBZRLE 0x1
751
752/**
753 * save_xbzrle_page: compress and send current page
754 *
755 * Returns: 1 means that we wrote the page
756 * 0 means that page is identical to the one already sent
757 * -1 means that xbzrle would be longer than normal
758 *
5a987738 759 * @rs: current RAM state
3d0684b2
JQ
760 * @current_data: pointer to the address of the page contents
761 * @current_addr: addr of the page
56e93d26
JQ
762 * @block: block that contains the page we want to send
763 * @offset: offset inside the block for the page
56e93d26 764 */
204b88b8 765static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 766 ram_addr_t current_addr, RAMBlock *block,
05931ec5 767 ram_addr_t offset)
56e93d26
JQ
768{
769 int encoded_len = 0, bytes_xbzrle;
770 uint8_t *prev_cached_page;
771
9360447d
JQ
772 if (!cache_is_cached(XBZRLE.cache, current_addr,
773 ram_counters.dirty_sync_count)) {
774 xbzrle_counters.cache_miss++;
05931ec5 775 if (!rs->last_stage) {
56e93d26 776 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 777 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
778 return -1;
779 } else {
780 /* update *current_data when the page has been
781 inserted into cache */
782 *current_data = get_cached_data(XBZRLE.cache, current_addr);
783 }
784 }
785 return -1;
786 }
787
e460a4b1
WW
788 /*
789 * Reaching here means the page has hit the xbzrle cache, no matter what
790 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 791 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
792 *
793 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
794 * 2nd page turns out to be skipped (i.e. no new bytes written to the
795 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
796 * skipped page included. In this way, the encoding rate can tell if the
797 * guest page is good for xbzrle encoding.
798 */
799 xbzrle_counters.pages++;
56e93d26
JQ
800 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
801
802 /* save current buffer into memory */
803 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
804
805 /* XBZRLE encoding (if there is no overflow) */
806 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
807 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
808 TARGET_PAGE_SIZE);
ca353803
WY
809
810 /*
811 * Update the cache contents, so that it corresponds to the data
812 * sent, in all cases except where we skip the page.
813 */
05931ec5 814 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
815 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
816 /*
817 * In the case where we couldn't compress, ensure that the caller
818 * sends the data from the cache, since the guest might have
819 * changed the RAM since we copied it.
820 */
821 *current_data = prev_cached_page;
822 }
823
56e93d26 824 if (encoded_len == 0) {
55c4446b 825 trace_save_xbzrle_page_skipping();
56e93d26
JQ
826 return 0;
827 } else if (encoded_len == -1) {
55c4446b 828 trace_save_xbzrle_page_overflow();
9360447d 829 xbzrle_counters.overflow++;
e460a4b1 830 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
831 return -1;
832 }
833
56e93d26 834 /* Send XBZRLE based compressed page */
2bf3aa85 835 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
836 offset | RAM_SAVE_FLAG_XBZRLE);
837 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
838 qemu_put_be16(rs->f, encoded_len);
839 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 840 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
841 /*
842 * Like compressed_size (please see update_compress_thread_counts),
843 * the xbzrle encoded bytes don't count the 8 byte header with
844 * RAM_SAVE_FLAG_CONTINUE.
845 */
846 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 847 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
848
849 return 1;
850}
851
3d0684b2
JQ
852/**
853 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 854 *
a5f7b1a6 855 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 856 *
6f37bb8b 857 * @rs: current RAM state
3d0684b2 858 * @rb: RAMBlock where to search for dirty pages
a935e30f 859 * @start: page where we start the search
f3f491fc 860 */
56e93d26 861static inline
a935e30f 862unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 863 unsigned long start)
56e93d26 864{
6b6712ef
JQ
865 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
866 unsigned long *bitmap = rb->bmap;
56e93d26 867
fbd162e6 868 if (ramblock_is_ignored(rb)) {
b895de50
CLG
869 return size;
870 }
871
1a373522 872 return find_next_bit(bitmap, size, start);
56e93d26
JQ
873}
874
1230a25f 875static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
876 unsigned long page)
877{
878 uint8_t shift;
879 hwaddr size, start;
880
881 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
882 return;
883 }
884
885 shift = rb->clear_bmap_shift;
886 /*
887 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
888 * can make things easier sometimes since then start address
889 * of the small chunk will always be 64 pages aligned so the
890 * bitmap will always be aligned to unsigned long. We should
891 * even be able to remove this restriction but I'm simply
892 * keeping it.
893 */
894 assert(shift >= 6);
895
896 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 897 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
898 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
899 memory_region_clear_dirty_bitmap(rb->mr, start, size);
900}
901
902static void
1230a25f 903migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
904 unsigned long start,
905 unsigned long npages)
906{
907 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
908 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
909 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
910
911 /*
912 * Clear pages from start to start + npages - 1, so the end boundary is
913 * exclusive.
914 */
915 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 916 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
917 }
918}
919
a6a83cef
RL
920/*
921 * colo_bitmap_find_diry:find contiguous dirty pages from start
922 *
923 * Returns the page offset within memory region of the start of the contiguout
924 * dirty page
925 *
926 * @rs: current RAM state
927 * @rb: RAMBlock where to search for dirty pages
928 * @start: page where we start the search
929 * @num: the number of contiguous dirty pages
930 */
931static inline
932unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
933 unsigned long start, unsigned long *num)
934{
935 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
936 unsigned long *bitmap = rb->bmap;
937 unsigned long first, next;
938
939 *num = 0;
940
941 if (ramblock_is_ignored(rb)) {
942 return size;
943 }
944
945 first = find_next_bit(bitmap, size, start);
946 if (first >= size) {
947 return first;
948 }
949 next = find_next_zero_bit(bitmap, size, first + 1);
950 assert(next >= first);
951 *num = next - first;
952 return first;
953}
954
06b10688 955static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
956 RAMBlock *rb,
957 unsigned long page)
a82d593b
DDAG
958{
959 bool ret;
a82d593b 960
002cad6b
PX
961 /*
962 * Clear dirty bitmap if needed. This _must_ be called before we
963 * send any of the page in the chunk because we need to make sure
964 * we can capture further page content changes when we sync dirty
965 * log the next time. So as long as we are going to send any of
966 * the page in the chunk we clear the remote dirty bitmap for all.
967 * Clearing it earlier won't be a problem, but too late will.
968 */
1230a25f 969 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 970
6b6712ef 971 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 972 if (ret) {
0d8ec885 973 rs->migration_dirty_pages--;
a82d593b 974 }
386a907b 975
a82d593b
DDAG
976 return ret;
977}
978
be39b4cd
DH
979static void dirty_bitmap_clear_section(MemoryRegionSection *section,
980 void *opaque)
981{
982 const hwaddr offset = section->offset_within_region;
983 const hwaddr size = int128_get64(section->size);
984 const unsigned long start = offset >> TARGET_PAGE_BITS;
985 const unsigned long npages = size >> TARGET_PAGE_BITS;
986 RAMBlock *rb = section->mr->ram_block;
987 uint64_t *cleared_bits = opaque;
988
989 /*
990 * We don't grab ram_state->bitmap_mutex because we expect to run
991 * only when starting migration or during postcopy recovery where
992 * we don't have concurrent access.
993 */
994 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
995 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
996 }
997 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
998 bitmap_clear(rb->bmap, start, npages);
999}
1000
1001/*
1002 * Exclude all dirty pages from migration that fall into a discarded range as
1003 * managed by a RamDiscardManager responsible for the mapped memory region of
1004 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1005 *
1006 * Discarded pages ("logically unplugged") have undefined content and must
1007 * not get migrated, because even reading these pages for migration might
1008 * result in undesired behavior.
1009 *
1010 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1011 *
1012 * Note: The result is only stable while migrating (precopy/postcopy).
1013 */
1014static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1015{
1016 uint64_t cleared_bits = 0;
1017
1018 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1019 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1020 MemoryRegionSection section = {
1021 .mr = rb->mr,
1022 .offset_within_region = 0,
1023 .size = int128_make64(qemu_ram_get_used_length(rb)),
1024 };
1025
1026 ram_discard_manager_replay_discarded(rdm, &section,
1027 dirty_bitmap_clear_section,
1028 &cleared_bits);
1029 }
1030 return cleared_bits;
1031}
1032
9470c5e0
DH
1033/*
1034 * Check if a host-page aligned page falls into a discarded range as managed by
1035 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1036 *
1037 * Note: The result is only stable while migrating (precopy/postcopy).
1038 */
1039bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1040{
1041 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1042 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1043 MemoryRegionSection section = {
1044 .mr = rb->mr,
1045 .offset_within_region = start,
1046 .size = int128_make64(qemu_ram_pagesize(rb)),
1047 };
1048
1049 return !ram_discard_manager_is_populated(rdm, &section);
1050 }
1051 return false;
1052}
1053
267691b6 1054/* Called with RCU critical section */
7a3e9571 1055static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1056{
fb613580
KZ
1057 uint64_t new_dirty_pages =
1058 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1059
1060 rs->migration_dirty_pages += new_dirty_pages;
1061 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1062}
1063
3d0684b2
JQ
1064/**
1065 * ram_pagesize_summary: calculate all the pagesizes of a VM
1066 *
1067 * Returns a summary bitmap of the page sizes of all RAMBlocks
1068 *
1069 * For VMs with just normal pages this is equivalent to the host page
1070 * size. If it's got some huge pages then it's the OR of all the
1071 * different page sizes.
e8ca1db2
DDAG
1072 */
1073uint64_t ram_pagesize_summary(void)
1074{
1075 RAMBlock *block;
1076 uint64_t summary = 0;
1077
fbd162e6 1078 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1079 summary |= block->page_size;
1080 }
1081
1082 return summary;
1083}
1084
aecbfe9c
XG
1085uint64_t ram_get_total_transferred_pages(void)
1086{
1087 return ram_counters.normal + ram_counters.duplicate +
1088 compression_counters.pages + xbzrle_counters.pages;
1089}
1090
b734035b
XG
1091static void migration_update_rates(RAMState *rs, int64_t end_time)
1092{
be8b02ed 1093 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1094 double compressed_size;
b734035b
XG
1095
1096 /* calculate period counters */
1097 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1098 / (end_time - rs->time_last_bitmap_sync);
1099
be8b02ed 1100 if (!page_count) {
b734035b
XG
1101 return;
1102 }
1103
1104 if (migrate_use_xbzrle()) {
e460a4b1
WW
1105 double encoded_size, unencoded_size;
1106
b734035b 1107 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1108 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1109 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1110 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1111 TARGET_PAGE_SIZE;
1112 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1113 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1114 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1115 } else {
1116 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1117 }
1118 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1119 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1120 }
76e03000
XG
1121
1122 if (migrate_use_compression()) {
1123 compression_counters.busy_rate = (double)(compression_counters.busy -
1124 rs->compress_thread_busy_prev) / page_count;
1125 rs->compress_thread_busy_prev = compression_counters.busy;
1126
1127 compressed_size = compression_counters.compressed_size -
1128 rs->compressed_size_prev;
1129 if (compressed_size) {
1130 double uncompressed_size = (compression_counters.pages -
1131 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1132
1133 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1134 compression_counters.compression_rate =
1135 uncompressed_size / compressed_size;
1136
1137 rs->compress_pages_prev = compression_counters.pages;
1138 rs->compressed_size_prev = compression_counters.compressed_size;
1139 }
1140 }
b734035b
XG
1141}
1142
dc14a470
KZ
1143static void migration_trigger_throttle(RAMState *rs)
1144{
1145 MigrationState *s = migrate_get_current();
1146 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1147
1148 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1149 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1150 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1151
1152 /* During block migration the auto-converge logic incorrectly detects
1153 * that ram migration makes no progress. Avoid this by disabling the
1154 * throttling logic during the bulk phase of block migration. */
1155 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1156 /* The following detection logic can be refined later. For now:
1157 Check to see if the ratio between dirtied bytes and the approx.
1158 amount of bytes that just got transferred since the last time
1159 we were in this routine reaches the threshold. If that happens
1160 twice, start or increase throttling. */
1161
1162 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1163 (++rs->dirty_rate_high_cnt >= 2)) {
1164 trace_migration_throttle();
1165 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1166 mig_throttle_guest_down(bytes_dirty_period,
1167 bytes_dirty_threshold);
dc14a470
KZ
1168 }
1169 }
1170}
1171
8d820d6f 1172static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1173{
1174 RAMBlock *block;
56e93d26 1175 int64_t end_time;
56e93d26 1176
9360447d 1177 ram_counters.dirty_sync_count++;
56e93d26 1178
f664da80
JQ
1179 if (!rs->time_last_bitmap_sync) {
1180 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1181 }
1182
1183 trace_migration_bitmap_sync_start();
9c1f8f44 1184 memory_global_dirty_log_sync();
56e93d26 1185
108cfae0 1186 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1187 WITH_RCU_READ_LOCK_GUARD() {
1188 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1189 ramblock_sync_dirty_bitmap(rs, block);
1190 }
1191 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1192 }
108cfae0 1193 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1194
9458a9a1 1195 memory_global_after_dirty_log_sync();
a66cd90c 1196 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1197
56e93d26
JQ
1198 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1199
1200 /* more than 1 second = 1000 millisecons */
f664da80 1201 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1202 migration_trigger_throttle(rs);
070afca2 1203
b734035b
XG
1204 migration_update_rates(rs, end_time);
1205
be8b02ed 1206 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1207
1208 /* reset period counters */
f664da80 1209 rs->time_last_bitmap_sync = end_time;
a66cd90c 1210 rs->num_dirty_pages_period = 0;
dc14a470 1211 rs->bytes_xfer_prev = ram_counters.transferred;
56e93d26 1212 }
4addcd4f 1213 if (migrate_use_events()) {
3ab72385 1214 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1215 }
56e93d26
JQ
1216}
1217
bd227060
WW
1218static void migration_bitmap_sync_precopy(RAMState *rs)
1219{
1220 Error *local_err = NULL;
1221
1222 /*
1223 * The current notifier usage is just an optimization to migration, so we
1224 * don't stop the normal migration process in the error case.
1225 */
1226 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1227 error_report_err(local_err);
b4a1733c 1228 local_err = NULL;
bd227060
WW
1229 }
1230
1231 migration_bitmap_sync(rs);
1232
1233 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1234 error_report_err(local_err);
1235 }
1236}
1237
a4dbaf8e 1238void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1239{
1240 if (!migrate_release_ram() || !migration_in_postcopy()) {
1241 return;
1242 }
1243
1244 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1245}
1246
6c97ec5f
XG
1247/**
1248 * save_zero_page_to_file: send the zero page to the file
1249 *
1250 * Returns the size of data written to the file, 0 means the page is not
1251 * a zero page
1252 *
1253 * @rs: current RAM state
1254 * @file: the file where the data is saved
1255 * @block: block that contains the page we want to send
1256 * @offset: offset inside the block for the page
1257 */
1258static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1259 RAMBlock *block, ram_addr_t offset)
1260{
1261 uint8_t *p = block->host + offset;
1262 int len = 0;
1263
bad452a7 1264 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
6c97ec5f
XG
1265 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1266 qemu_put_byte(file, 0);
1267 len += 1;
47fe16ff 1268 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1269 }
1270 return len;
1271}
1272
56e93d26 1273/**
3d0684b2 1274 * save_zero_page: send the zero page to the stream
56e93d26 1275 *
3d0684b2 1276 * Returns the number of pages written.
56e93d26 1277 *
f7ccd61b 1278 * @rs: current RAM state
56e93d26
JQ
1279 * @block: block that contains the page we want to send
1280 * @offset: offset inside the block for the page
56e93d26 1281 */
7faccdc3 1282static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1283{
6c97ec5f 1284 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1285
6c97ec5f 1286 if (len) {
9360447d 1287 ram_counters.duplicate++;
4c2d0f6d 1288 ram_transferred_add(len);
6c97ec5f 1289 return 1;
56e93d26 1290 }
6c97ec5f 1291 return -1;
56e93d26
JQ
1292}
1293
059ff0fb
XG
1294/*
1295 * @pages: the number of pages written by the control path,
1296 * < 0 - error
1297 * > 0 - number of pages written
1298 *
1299 * Return true if the pages has been saved, otherwise false is returned.
1300 */
1301static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1302 int *pages)
1303{
1304 uint64_t bytes_xmit = 0;
1305 int ret;
1306
1307 *pages = -1;
1308 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1309 &bytes_xmit);
1310 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1311 return false;
1312 }
1313
1314 if (bytes_xmit) {
4c2d0f6d 1315 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1316 *pages = 1;
1317 }
1318
1319 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1320 return true;
1321 }
1322
1323 if (bytes_xmit > 0) {
1324 ram_counters.normal++;
1325 } else if (bytes_xmit == 0) {
1326 ram_counters.duplicate++;
1327 }
1328
1329 return true;
1330}
1331
65dacaa0
XG
1332/*
1333 * directly send the page to the stream
1334 *
1335 * Returns the number of pages written.
1336 *
1337 * @rs: current RAM state
1338 * @block: block that contains the page we want to send
1339 * @offset: offset inside the block for the page
1340 * @buf: the page to be sent
1341 * @async: send to page asyncly
1342 */
1343static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1344 uint8_t *buf, bool async)
1345{
4c2d0f6d
DE
1346 ram_transferred_add(save_page_header(rs, rs->f, block,
1347 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0
XG
1348 if (async) {
1349 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
f912ec5b 1350 migrate_release_ram() &&
65dacaa0
XG
1351 migration_in_postcopy());
1352 } else {
1353 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1354 }
4c2d0f6d 1355 ram_transferred_add(TARGET_PAGE_SIZE);
65dacaa0
XG
1356 ram_counters.normal++;
1357 return 1;
1358}
1359
56e93d26 1360/**
3d0684b2 1361 * ram_save_page: send the given page to the stream
56e93d26 1362 *
3d0684b2 1363 * Returns the number of pages written.
3fd3c4b3
DDAG
1364 * < 0 - error
1365 * >=0 - Number of pages written - this might legally be 0
1366 * if xbzrle noticed the page was the same.
56e93d26 1367 *
6f37bb8b 1368 * @rs: current RAM state
56e93d26
JQ
1369 * @block: block that contains the page we want to send
1370 * @offset: offset inside the block for the page
56e93d26 1371 */
05931ec5 1372static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1373{
1374 int pages = -1;
56e93d26 1375 uint8_t *p;
56e93d26 1376 bool send_async = true;
a08f6890 1377 RAMBlock *block = pss->block;
8bba004c 1378 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1379 ram_addr_t current_addr = block->offset + offset;
56e93d26 1380
2f68e399 1381 p = block->host + offset;
1db9d8e5 1382 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1383
56e93d26 1384 XBZRLE_cache_lock();
1a373522 1385 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
059ff0fb 1386 pages = save_xbzrle_page(rs, &p, current_addr, block,
05931ec5
JQ
1387 offset);
1388 if (!rs->last_stage) {
059ff0fb
XG
1389 /* Can't send this cached data async, since the cache page
1390 * might get updated before it gets to the wire
56e93d26 1391 */
059ff0fb 1392 send_async = false;
56e93d26
JQ
1393 }
1394 }
1395
1396 /* XBZRLE overflow or normal page */
1397 if (pages == -1) {
65dacaa0 1398 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1399 }
1400
1401 XBZRLE_cache_unlock();
1402
1403 return pages;
1404}
1405
b9ee2f7d
JQ
1406static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1407 ram_addr_t offset)
1408{
67a4c891 1409 if (multifd_queue_page(rs->f, block, offset) < 0) {
713f762a
IR
1410 return -1;
1411 }
b9ee2f7d
JQ
1412 ram_counters.normal++;
1413
1414 return 1;
1415}
1416
5e5fdcff 1417static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1418 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1419{
53518d94 1420 RAMState *rs = ram_state;
20d549cb 1421 uint8_t *p = block->host + offset;
6ef3771c 1422 int ret;
56e93d26 1423
5e5fdcff 1424 if (save_zero_page_to_file(rs, f, block, offset)) {
e7f2e190 1425 return true;
5e5fdcff
XG
1426 }
1427
6ef3771c 1428 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1429
1430 /*
1431 * copy it to a internal buffer to avoid it being modified by VM
1432 * so that we can catch up the error during compression and
1433 * decompression
1434 */
1435 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1436 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1437 if (ret < 0) {
1438 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1439 error_report("compressed data failed!");
b3be2896 1440 }
e7f2e190 1441 return false;
5e5fdcff
XG
1442}
1443
1444static void
1445update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1446{
4c2d0f6d 1447 ram_transferred_add(bytes_xmit);
76e03000 1448
5e5fdcff
XG
1449 if (param->zero_page) {
1450 ram_counters.duplicate++;
76e03000 1451 return;
5e5fdcff 1452 }
76e03000
XG
1453
1454 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1455 compression_counters.compressed_size += bytes_xmit - 8;
1456 compression_counters.pages++;
56e93d26
JQ
1457}
1458
32b05495
XG
1459static bool save_page_use_compression(RAMState *rs);
1460
ce25d337 1461static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1462{
1463 int idx, len, thread_count;
1464
32b05495 1465 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1466 return;
1467 }
1468 thread_count = migrate_compress_threads();
a7a9a88f 1469
0d9f9a5c 1470 qemu_mutex_lock(&comp_done_lock);
56e93d26 1471 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1472 while (!comp_param[idx].done) {
0d9f9a5c 1473 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1474 }
a7a9a88f 1475 }
0d9f9a5c 1476 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1477
1478 for (idx = 0; idx < thread_count; idx++) {
1479 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1480 if (!comp_param[idx].quit) {
ce25d337 1481 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1482 /*
1483 * it's safe to fetch zero_page without holding comp_done_lock
1484 * as there is no further request submitted to the thread,
1485 * i.e, the thread should be waiting for a request at this point.
1486 */
1487 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1488 }
a7a9a88f 1489 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1490 }
1491}
1492
1493static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1494 ram_addr_t offset)
1495{
1496 param->block = block;
1497 param->offset = offset;
1498}
1499
ce25d337
JQ
1500static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1501 ram_addr_t offset)
56e93d26
JQ
1502{
1503 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1504 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1505
1506 thread_count = migrate_compress_threads();
0d9f9a5c 1507 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1508retry:
1509 for (idx = 0; idx < thread_count; idx++) {
1510 if (comp_param[idx].done) {
1511 comp_param[idx].done = false;
1512 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1513 qemu_mutex_lock(&comp_param[idx].mutex);
1514 set_compress_params(&comp_param[idx], block, offset);
1515 qemu_cond_signal(&comp_param[idx].cond);
1516 qemu_mutex_unlock(&comp_param[idx].mutex);
1517 pages = 1;
5e5fdcff 1518 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1519 break;
56e93d26
JQ
1520 }
1521 }
1d58872a
XG
1522
1523 /*
1524 * wait for the free thread if the user specifies 'compress-wait-thread',
1525 * otherwise we will post the page out in the main thread as normal page.
1526 */
1527 if (pages < 0 && wait) {
1528 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1529 goto retry;
1530 }
0d9f9a5c 1531 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1532
1533 return pages;
1534}
1535
3d0684b2
JQ
1536/**
1537 * find_dirty_block: find the next dirty page and update any state
1538 * associated with the search process.
b9e60928 1539 *
a5f7b1a6 1540 * Returns true if a page is found
b9e60928 1541 *
6f37bb8b 1542 * @rs: current RAM state
3d0684b2
JQ
1543 * @pss: data about the state of the current dirty page scan
1544 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1545 */
f20e2865 1546static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1547{
82b54ef4
PX
1548 /*
1549 * This is not a postcopy requested page, mark it "not urgent", and use
1550 * precopy channel to send it.
1551 */
ea2faf0c 1552 pss->postcopy_requested = false;
82b54ef4 1553 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
ea2faf0c 1554
f20e2865 1555 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1556 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1557 pss->page >= rs->last_page) {
b9e60928
DDAG
1558 /*
1559 * We've been once around the RAM and haven't found anything.
1560 * Give up.
1561 */
1562 *again = false;
1563 return false;
1564 }
542147f4
DH
1565 if (!offset_in_ramblock(pss->block,
1566 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1567 /* Didn't find anything in this RAM Block */
a935e30f 1568 pss->page = 0;
b9e60928
DDAG
1569 pss->block = QLIST_NEXT_RCU(pss->block, next);
1570 if (!pss->block) {
48df9d80
XG
1571 /*
1572 * If memory migration starts over, we will meet a dirtied page
1573 * which may still exists in compression threads's ring, so we
1574 * should flush the compressed data to make sure the new page
1575 * is not overwritten by the old one in the destination.
1576 *
1577 * Also If xbzrle is on, stop using the data compression at this
1578 * point. In theory, xbzrle can do better than compression.
1579 */
1580 flush_compressed_data(rs);
1581
b9e60928
DDAG
1582 /* Hit the end of the list */
1583 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1584 /* Flag that we've looped */
1585 pss->complete_round = true;
1a373522
DH
1586 /* After the first round, enable XBZRLE. */
1587 if (migrate_use_xbzrle()) {
1588 rs->xbzrle_enabled = true;
1589 }
b9e60928
DDAG
1590 }
1591 /* Didn't find anything this time, but try again on the new block */
1592 *again = true;
1593 return false;
1594 } else {
1595 /* Can go around again, but... */
1596 *again = true;
1597 /* We've found something so probably don't need to */
1598 return true;
1599 }
1600}
1601
3d0684b2
JQ
1602/**
1603 * unqueue_page: gets a page of the queue
1604 *
a82d593b 1605 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1606 *
3d0684b2
JQ
1607 * Returns the block of the page (or NULL if none available)
1608 *
ec481c6c 1609 * @rs: current RAM state
3d0684b2 1610 * @offset: used to return the offset within the RAMBlock
a82d593b 1611 */
f20e2865 1612static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1613{
a1fe28df 1614 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1615 RAMBlock *block = NULL;
1616
a1fe28df 1617 if (!postcopy_has_request(rs)) {
ae526e32
XG
1618 return NULL;
1619 }
1620
6e8a355d 1621 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1622
1623 /*
1624 * This should _never_ change even after we take the lock, because no one
1625 * should be taking anything off the request list other than us.
1626 */
1627 assert(postcopy_has_request(rs));
1628
1629 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1630 block = entry->rb;
1631 *offset = entry->offset;
1632
777f53c7
TH
1633 if (entry->len > TARGET_PAGE_SIZE) {
1634 entry->len -= TARGET_PAGE_SIZE;
1635 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1636 } else {
1637 memory_region_unref(block->mr);
1638 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1639 g_free(entry);
1640 migration_consume_urgent_request();
a82d593b 1641 }
a82d593b
DDAG
1642
1643 return block;
1644}
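/*
 * A minimal standalone sketch (not part of ram.c) of the dequeue policy used
 * by unqueue_page() above: a queued request covering several target pages is
 * consumed one page at a time, shrinking the entry in place until the last
 * page, at which point the entry is removed. The structure names and the
 * fixed 4 KiB page size are illustrative assumptions.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct page_request {
    unsigned long offset;       /* start offset of the request */
    unsigned long len;          /* remaining length in bytes */
    struct page_request *next;
};

/* Pop one page worth of work from the head of the queue; returns its offset. */
static long pop_one_page(struct page_request **head)
{
    struct page_request *req = *head;
    long offset;

    if (!req) {
        return -1;              /* queue empty */
    }
    offset = req->offset;
    if (req->len > PAGE_SIZE) {
        /* Keep the entry, but advance it past the page we just consumed. */
        req->offset += PAGE_SIZE;
        req->len -= PAGE_SIZE;
    } else {
        /* Last page of this request: drop the entry entirely. */
        *head = req->next;
        free(req);
    }
    return offset;
}

int main(void)
{
    struct page_request *q = malloc(sizeof(*q));
    long off;

    q->offset = 0x10000;
    q->len = 3 * PAGE_SIZE;
    q->next = NULL;

    while ((off = pop_one_page(&q)) >= 0) {
        printf("send page at offset 0x%lx\n", (unsigned long)off);
    }
    return 0;
}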
1645
278e2f55
AG
1646#if defined(__linux__)
1647/**
1648 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1649 * is found, return RAM block pointer and page offset
1650 *
1651 * Returns pointer to the RAMBlock containing faulting page,
1652 * NULL if no write faults are pending
1653 *
1654 * @rs: current RAM state
1655 * @offset: page offset from the beginning of the block
1656 */
1657static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1658{
1659 struct uffd_msg uffd_msg;
1660 void *page_address;
82ea3e3b 1661 RAMBlock *block;
278e2f55
AG
1662 int res;
1663
1664 if (!migrate_background_snapshot()) {
1665 return NULL;
1666 }
1667
1668 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1669 if (res <= 0) {
1670 return NULL;
1671 }
1672
1673 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1674 block = qemu_ram_block_from_host(page_address, false, offset);
1675 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1676 return block;
278e2f55
AG
1677}
1678
1679/**
1680 * ram_save_release_protection: release UFFD write protection after
1681 * a range of pages has been saved
1682 *
1683 * @rs: current RAM state
1684 * @pss: page-search-status structure
1685 * @start_page: index of the first page in the range relative to pss->block
1686 *
1687 * Returns 0 on success, negative value in case of an error
1688 */
1689static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1690 unsigned long start_page)
1691{
1692 int res = 0;
1693
1694 /* Check if page is from UFFD-managed region. */
1695 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1696 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1697 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1698
1699 /* Flush async buffers before un-protect. */
1700 qemu_fflush(rs->f);
1701 /* Un-protect memory range. */
1702 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1703 false, false);
1704 }
1705
1706 return res;
1707}
1708
1709/* ram_write_tracking_available: check if kernel supports required UFFD features
1710 *
1711 * Returns true if supported, false otherwise
1712 */
1713bool ram_write_tracking_available(void)
1714{
1715 uint64_t uffd_features;
1716 int res;
1717
1718 res = uffd_query_features(&uffd_features);
1719 return (res == 0 &&
1720 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1721}
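/*
 * A minimal standalone sketch (not part of QEMU) showing the kind of probe
 * that backs ram_write_tracking_available(): open a userfaultfd, do the
 * UFFDIO_API handshake with no features requested, and look for the
 * write-protect fault feature in what the kernel reports back. That QEMU's
 * uffd_query_features() helper wraps a similar handshake is an assumption.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };

    if (fd < 0) {
        perror("userfaultfd");
        return 1;
    }
    if (ioctl(fd, UFFDIO_API, &api) == -1) {
        perror("UFFDIO_API");
        close(fd);
        return 1;
    }
    printf("uffd-wp %savailable\n",
           (api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) ? "" : "not ");
    close(fd);
    return 0;
}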
1722
1723/* ram_write_tracking_compatible: check if guest configuration is
1724 * compatible with 'write-tracking'
1725 *
1726 * Returns true if compatible, false otherwise
1727 */
1728bool ram_write_tracking_compatible(void)
1729{
1730 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1731 int uffd_fd;
82ea3e3b 1732 RAMBlock *block;
278e2f55
AG
1733 bool ret = false;
1734
1735 /* Open UFFD file descriptor */
1736 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1737 if (uffd_fd < 0) {
1738 return false;
1739 }
1740
1741 RCU_READ_LOCK_GUARD();
1742
82ea3e3b 1743 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1744 uint64_t uffd_ioctls;
1745
1746 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1747 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1748 continue;
1749 }
1750 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1751 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1752 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1753 goto out;
1754 }
1755 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1756 goto out;
1757 }
1758 }
1759 ret = true;
1760
1761out:
1762 uffd_close_fd(uffd_fd);
1763 return ret;
1764}
1765
f7b9dcfb
DH
1766static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1767 ram_addr_t size)
1768{
1769 /*
1770 * We read one byte of each page; this will preallocate page tables if
1771 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1772 * where no page was populated yet. This might require adaptation when
1773 * supporting other mappings, like shmem.
1774 */
1775 for (; offset < size; offset += block->page_size) {
1776 char tmp = *((char *)block->host + offset);
1777
1778 /* Don't optimize the read out */
1779 asm volatile("" : "+r" (tmp));
1780 }
1781}
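/*
 * A minimal standalone sketch (not part of ram.c) of the page-touching trick
 * used by populate_read_range(): read one byte per page so the kernel
 * allocates page tables (and the shared zeropage for anonymous MAP_PRIVATE
 * memory), with an empty asm statement keeping the read from being optimized
 * away.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    size_t page = (size_t)sysconf(_SC_PAGESIZE);
    size_t len = 16 * page;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    for (size_t off = 0; off < len; off += page) {
        char tmp = buf[off];

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
    printf("populated %zu pages\n", len / page);
    munmap(buf, len);
    return 0;
}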
1782
6fee3a1f
DH
1783static inline int populate_read_section(MemoryRegionSection *section,
1784 void *opaque)
1785{
1786 const hwaddr size = int128_get64(section->size);
1787 hwaddr offset = section->offset_within_region;
1788 RAMBlock *block = section->mr->ram_block;
1789
1790 populate_read_range(block, offset, size);
1791 return 0;
1792}
1793
eeccb99c 1794/*
f7b9dcfb
DH
1795 * ram_block_populate_read: preallocate page tables and populate pages in the
1796 * RAM block by reading a byte of each page.
eeccb99c
AG
1797 *
1798 * Since it's solely used for the userfault_fd WP feature, here we just
1799 * hardcode the page size to qemu_real_host_page_size.
1800 *
82ea3e3b 1801 * @rb: RAM block to populate
eeccb99c 1802 */
6fee3a1f 1803static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1804{
6fee3a1f
DH
1805 /*
1806 * Skip populating all pages that fall into a discarded range as managed by
1807 * a RamDiscardManager responsible for the mapped memory region of the
1808 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1809 * must not get populated automatically. We don't have to track
1810 * modifications via userfaultfd WP reliably, because these pages will
1811 * not be part of the migration stream either way -- see
1812 * ramblock_dirty_bitmap_exclude_discarded_pages().
1813 *
1814 * Note: The result is only stable while migrating (precopy/postcopy).
1815 */
1816 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1817 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1818 MemoryRegionSection section = {
1819 .mr = rb->mr,
1820 .offset_within_region = 0,
1821 .size = rb->mr->size,
1822 };
1823
1824 ram_discard_manager_replay_populated(rdm, &section,
1825 populate_read_section, NULL);
1826 } else {
1827 populate_read_range(rb, 0, rb->used_length);
1828 }
eeccb99c
AG
1829}
1830
1831/*
1832 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1833 */
1834void ram_write_tracking_prepare(void)
1835{
82ea3e3b 1836 RAMBlock *block;
eeccb99c
AG
1837
1838 RCU_READ_LOCK_GUARD();
1839
82ea3e3b 1840 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1841 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1842 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1843 continue;
1844 }
1845
1846 /*
1847 * Populate pages of the RAM block before enabling userfault_fd
1848 * write protection.
1849 *
1850 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1851 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1852 * pages with pte_none() entries in page table.
1853 */
f7b9dcfb 1854 ram_block_populate_read(block);
eeccb99c
AG
1855 }
1856}
1857
278e2f55
AG
1858/*
1859 * ram_write_tracking_start: start UFFD-WP memory tracking
1860 *
1861 * Returns 0 for success or negative value in case of error
1862 */
1863int ram_write_tracking_start(void)
1864{
1865 int uffd_fd;
1866 RAMState *rs = ram_state;
82ea3e3b 1867 RAMBlock *block;
278e2f55
AG
1868
1869 /* Open UFFD file descriptor */
1870 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1871 if (uffd_fd < 0) {
1872 return uffd_fd;
1873 }
1874 rs->uffdio_fd = uffd_fd;
1875
1876 RCU_READ_LOCK_GUARD();
1877
82ea3e3b 1878 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1879 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1880 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1881 continue;
1882 }
1883
1884 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1885 if (uffd_register_memory(rs->uffdio_fd, block->host,
1886 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1887 goto fail;
1888 }
1889 /* Apply UFFD write protection to the block memory range */
82ea3e3b
AG
1890 if (uffd_change_protection(rs->uffdio_fd, block->host,
1891 block->max_length, true, false)) {
278e2f55
AG
1892 goto fail;
1893 }
82ea3e3b
AG
1894 block->flags |= RAM_UF_WRITEPROTECT;
1895 memory_region_ref(block->mr);
278e2f55 1896
82ea3e3b
AG
1897 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1898 block->host, block->max_length);
278e2f55
AG
1899 }
1900
1901 return 0;
1902
1903fail:
1904 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1905
82ea3e3b
AG
1906 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1907 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1908 continue;
1909 }
1910 /*
1911 * In case some memory block failed to be write-protected,
1912 * remove protection from and unregister all RAM blocks that succeeded
1913 */
82ea3e3b
AG
1914 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1915 false, false);
1916 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1917 /* Cleanup flags and remove reference */
82ea3e3b
AG
1918 block->flags &= ~RAM_UF_WRITEPROTECT;
1919 memory_region_unref(block->mr);
278e2f55
AG
1920 }
1921
1922 uffd_close_fd(uffd_fd);
1923 rs->uffdio_fd = -1;
1924 return -1;
1925}
1926
1927/**
1928 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1929 */
1930void ram_write_tracking_stop(void)
1931{
1932 RAMState *rs = ram_state;
82ea3e3b 1933 RAMBlock *block;
278e2f55
AG
1934
1935 RCU_READ_LOCK_GUARD();
1936
82ea3e3b
AG
1937 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1938 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1939 continue;
1940 }
1941 /* Remove protection and unregister all affected RAM blocks */
82ea3e3b
AG
1942 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1943 false, false);
1944 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1945
82ea3e3b
AG
1946 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1947 block->host, block->max_length);
278e2f55
AG
1948
1949 /* Cleanup flags and remove reference */
82ea3e3b
AG
1950 block->flags &= ~RAM_UF_WRITEPROTECT;
1951 memory_region_unref(block->mr);
278e2f55
AG
1952 }
1953
1954 /* Finally close UFFD file descriptor */
1955 uffd_close_fd(rs->uffdio_fd);
1956 rs->uffdio_fd = -1;
1957}
1958
1959#else
1960/* No target OS support, stubs just fail or ignore */
1961
1962static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1963{
1964 (void) rs;
1965 (void) offset;
1966
1967 return NULL;
1968}
1969
1970static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1971 unsigned long start_page)
1972{
1973 (void) rs;
1974 (void) pss;
1975 (void) start_page;
1976
1977 return 0;
1978}
1979
1980bool ram_write_tracking_available(void)
1981{
1982 return false;
1983}
1984
1985bool ram_write_tracking_compatible(void)
1986{
1987 assert(0);
1988 return false;
1989}
1990
1991int ram_write_tracking_start(void)
1992{
1993 assert(0);
1994 return -1;
1995}
1996
1997void ram_write_tracking_stop(void)
1998{
1999 assert(0);
2000}
2001#endif /* defined(__linux__) */
2002
c01b16ed
PX
2003/*
2004 * Check whether two addr/offset of the ramblock falls onto the same host huge
2005 * page. Returns true if so, false otherwise.
2006 */
2007static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2008 uint64_t addr2)
2009{
2010 size_t page_size = qemu_ram_pagesize(rb);
2011
2012 addr1 = ROUND_DOWN(addr1, page_size);
2013 addr2 = ROUND_DOWN(addr2, page_size);
2014
2015 return addr1 == addr2;
2016}
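/*
 * A minimal standalone sketch (not part of ram.c) of the check performed by
 * offset_on_same_huge_page(): two offsets share a host huge page exactly when
 * rounding both down to the huge page size gives the same value. The 2 MiB
 * page size is an illustrative assumption.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HUGE_PAGE_SIZE (2UL * 1024 * 1024)

static bool same_huge_page(uint64_t a, uint64_t b, uint64_t page_size)
{
    /* Equivalent to comparing ROUND_DOWN(a, page_size) with ROUND_DOWN(b, page_size) */
    return (a / page_size) == (b / page_size);
}

int main(void)
{
    uint64_t a = 0x201000;   /* 4 KiB into the second 2 MiB page */
    uint64_t b = 0x3ff000;   /* last 4 KiB of that same 2 MiB page */
    uint64_t c = 0x400000;   /* first byte of the third 2 MiB page */

    printf("a,b same huge page: %d\n", same_huge_page(a, b, HUGE_PAGE_SIZE)); /* 1 */
    printf("a,c same huge page: %d\n", same_huge_page(a, c, HUGE_PAGE_SIZE)); /* 0 */
    return 0;
}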
2017
2018/*
2019 * Check whether a previously preempted precopy huge page contains the
2020 * currently requested page. Returns true if so, false otherwise.
2021 *
2022 * This should happen very rarely, because it means that while we were
2023 * sending during background migration for postcopy, we were sending exactly
2024 * the page that some vcpu on the destination node faulted on. When it
2025 * happens, we probably don't need to do much but drop the request, because
2026 * we know it'll be serviced right after we resume the precopy stream. It'll
2027 * slightly affect the order in which postcopy requests are serviced (it's
2028 * the same as moving the current request to the end of the queue), but it
2029 * shouldn't be a big deal. The most important thing is that we can _never_
2030 * try to send a partially-sent huge page on the POSTCOPY channel again, or
2031 * that huge page gets "split brain" on two channels (PRECOPY, POSTCOPY).
2032 */
2033static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2034 ram_addr_t offset)
2035{
2036 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2037
2038 /* No preemption at all? */
2039 if (!state->preempted) {
2040 return false;
2041 }
2042
2043 /* Not even the same ramblock? */
2044 if (state->ram_block != block) {
2045 return false;
2046 }
2047
2048 return offset_on_same_huge_page(block, offset,
2049 state->ram_page << TARGET_PAGE_BITS);
2050}
2051
3d0684b2 2052/**
ff1543af 2053 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2054 *
2055 * Skips pages that are already sent (!dirty)
a82d593b 2056 *
a5f7b1a6 2057 * Returns true if a queued page is found
a82d593b 2058 *
6f37bb8b 2059 * @rs: current RAM state
3d0684b2 2060 * @pss: data about the state of the current dirty page scan
a82d593b 2061 */
f20e2865 2062static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2063{
2064 RAMBlock *block;
2065 ram_addr_t offset;
777f53c7
TH
2066 bool dirty;
2067
2068 do {
2069 block = unqueue_page(rs, &offset);
2070 /*
2071 * We're sending this page, and since it's postcopy nothing else
2072 * will dirty it, and we must make sure it doesn't get sent again
2073 * even if this queue request was received after the background
2074 * search already sent it.
2075 */
2076 if (block) {
2077 unsigned long page;
2078
2079 page = offset >> TARGET_PAGE_BITS;
2080 dirty = test_bit(page, block->bmap);
2081 if (!dirty) {
2082 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2083 page);
2084 } else {
2085 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2086 }
2087 }
a82d593b 2088
777f53c7 2089 } while (block && !dirty);
a82d593b 2090
c01b16ed
PX
2091 if (block) {
2092 /* See comment above postcopy_preempted_contains() */
2093 if (postcopy_preempted_contains(rs, block, offset)) {
2094 trace_postcopy_preempt_hit(block->idstr, offset);
82b54ef4
PX
2095 /*
2096 * If what we preempted previously was exactly what we're
2097 * requesting right now, restore the preempted precopy
2098 * immediately, boosting its priority as it's requested by
2099 * postcopy.
2100 */
2101 postcopy_preempt_restore(rs, pss, true);
2102 return true;
c01b16ed
PX
2103 }
2104 } else {
278e2f55
AG
2105 /*
2106 * Poll write faults too if background snapshot is enabled; that's
2107 * when we have vcpus got blocked by the write protected pages.
2108 */
2109 block = poll_fault_page(rs, &offset);
2110 }
2111
a82d593b 2112 if (block) {
a82d593b
DDAG
2113 /*
2114 * We want the background search to continue from the queued page
2115 * since the guest is likely to want other pages near to the page
2116 * it just requested.
2117 */
2118 pss->block = block;
a935e30f 2119 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2120
2121 /*
2122 * This unqueued page would break the "one round" check, even if it's
2123 * really rare.
2124 */
2125 pss->complete_round = false;
82b54ef4 2126 /* Mark it an urgent request, meanwhile using POSTCOPY channel */
ea2faf0c 2127 pss->postcopy_requested = true;
82b54ef4 2128 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
a82d593b
DDAG
2129 }
2130
2131 return !!block;
2132}
2133
6c595cde 2134/**
5e58f968
JQ
2135 * migration_page_queue_free: drop any remaining pages in the ram
2136 * request queue
6c595cde 2137 *
3d0684b2
JQ
2138 * It should be empty at the end anyway, but in error cases there may
2139 * be some left; if any pages remain, we drop them.
2140 *
6c595cde 2141 */
83c13382 2142static void migration_page_queue_free(RAMState *rs)
6c595cde 2143{
ec481c6c 2144 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2145 /* This queue generally should be empty - but in the case of a failed
2146 * migration it might have some leftover entries.
2147 */
89ac5a1d 2148 RCU_READ_LOCK_GUARD();
ec481c6c 2149 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2150 memory_region_unref(mspr->rb->mr);
ec481c6c 2151 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2152 g_free(mspr);
2153 }
6c595cde
DDAG
2154}
2155
2156/**
3d0684b2
JQ
2157 * ram_save_queue_pages: queue the page for transmission
2158 *
2159 * A request from postcopy destination for example.
2160 *
2161 * Returns zero on success or negative on error
2162 *
3d0684b2
JQ
2163 * @rbname: Name of the RAMBlock of the request. NULL means the
2164 * same as the last one.
2165 * @start: starting address from the start of the RAMBlock
2166 * @len: length (in bytes) to send
6c595cde 2167 */
96506894 2168int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2169{
2170 RAMBlock *ramblock;
53518d94 2171 RAMState *rs = ram_state;
6c595cde 2172
9360447d 2173 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2174 RCU_READ_LOCK_GUARD();
2175
6c595cde
DDAG
2176 if (!rbname) {
2177 /* Reuse last RAMBlock */
68a098f3 2178 ramblock = rs->last_req_rb;
6c595cde
DDAG
2179
2180 if (!ramblock) {
2181 /*
2182 * Shouldn't happen, we can't reuse the last RAMBlock if
2183 * it's the 1st request.
2184 */
2185 error_report("ram_save_queue_pages no previous block");
03acb4e9 2186 return -1;
6c595cde
DDAG
2187 }
2188 } else {
2189 ramblock = qemu_ram_block_by_name(rbname);
2190
2191 if (!ramblock) {
2192 /* We shouldn't be asked for a non-existent RAMBlock */
2193 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2194 return -1;
6c595cde 2195 }
68a098f3 2196 rs->last_req_rb = ramblock;
6c595cde
DDAG
2197 }
2198 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2199 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2200 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2201 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2202 __func__, start, len, ramblock->used_length);
03acb4e9 2203 return -1;
6c595cde
DDAG
2204 }
2205
ec481c6c 2206 struct RAMSrcPageRequest *new_entry =
b21e2380 2207 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2208 new_entry->rb = ramblock;
2209 new_entry->offset = start;
2210 new_entry->len = len;
2211
2212 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2213 qemu_mutex_lock(&rs->src_page_req_mutex);
2214 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2215 migration_make_urgent_request();
ec481c6c 2216 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2217
2218 return 0;
6c595cde
DDAG
2219}
2220
d7400a34
XG
2221static bool save_page_use_compression(RAMState *rs)
2222{
2223 if (!migrate_use_compression()) {
2224 return false;
2225 }
2226
2227 /*
1a373522
DH
2228 * If xbzrle is enabled (e.g., after first round of migration), stop
2229 * using the data compression. In theory, xbzrle can do better than
2230 * compression.
d7400a34 2231 */
1a373522
DH
2232 if (rs->xbzrle_enabled) {
2233 return false;
d7400a34
XG
2234 }
2235
1a373522 2236 return true;
d7400a34
XG
2237}
2238
5e5fdcff
XG
2239/*
2240 * try to compress the page before posting it out, return true if the page
2241 * has been properly handled by compression, otherwise needs other
2242 * paths to handle it
2243 */
2244static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2245{
2246 if (!save_page_use_compression(rs)) {
2247 return false;
2248 }
2249
2250 /*
2251 * When starting the process of a new block, the first page of
2252 * the block should be sent out before other pages in the same
2253 * block, and all the pages in the last block should have been sent
2254 * out. Keeping this order is important, because the 'cont' flag
2255 * is used to avoid resending the block name.
2256 *
2257 * We post the first page as a normal page since compressing it would
2258 * take a lot of CPU time.
2259 */
2260 if (block != rs->last_sent_block) {
2261 flush_compressed_data(rs);
2262 return false;
2263 }
2264
2265 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2266 return true;
2267 }
2268
76e03000 2269 compression_counters.busy++;
5e5fdcff
XG
2270 return false;
2271}
2272
a82d593b 2273/**
3d0684b2 2274 * ram_save_target_page: save one target page
a82d593b 2275 *
3d0684b2 2276 * Returns the number of pages written
a82d593b 2277 *
6f37bb8b 2278 * @rs: current RAM state
3d0684b2 2279 * @pss: data about the page we want to send
a82d593b 2280 */
05931ec5 2281static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2282{
a8ec91f9 2283 RAMBlock *block = pss->block;
8bba004c 2284 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2285 int res;
2286
2287 if (control_save_page(rs, block, offset, &res)) {
2288 return res;
2289 }
2290
5e5fdcff
XG
2291 if (save_compress_page(rs, block, offset)) {
2292 return 1;
d7400a34
XG
2293 }
2294
2295 res = save_zero_page(rs, block, offset);
2296 if (res > 0) {
2297 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2298 * page would be stale
2299 */
ef5c3d13 2300 if (rs->xbzrle_enabled) {
d7400a34
XG
2301 XBZRLE_cache_lock();
2302 xbzrle_cache_zero_page(rs, block->offset + offset);
2303 XBZRLE_cache_unlock();
2304 }
d7400a34
XG
2305 return res;
2306 }
2307
da3f56cb 2308 /*
6f39c90b
PX
2309 * Do not use multifd in postcopy as one whole host page should be
2310 * placed. Meanwhile postcopy requires atomic update of pages, so even
2311 * if host page size == guest page size, the running destination guest may
2312 * still see partially copied pages, which is data corruption.
da3f56cb 2313 */
6f39c90b 2314 if (migrate_use_multifd() && !migration_in_postcopy()) {
b9ee2f7d 2315 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2316 }
2317
05931ec5 2318 return ram_save_page(rs, pss);
a82d593b
DDAG
2319}
2320
c01b16ed
PX
2321static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2322{
c8750de1
PX
2323 MigrationState *ms = migrate_get_current();
2324
c01b16ed
PX
2325 /* Eager preempt not enabled? Then never do that. */
2326 if (!migrate_postcopy_preempt()) {
2327 return false;
2328 }
2329
c8750de1
PX
2330 /* If the user explicitly disabled breaking of huge page, skip */
2331 if (!ms->postcopy_preempt_break_huge) {
2332 return false;
2333 }
2334
c01b16ed
PX
2335 /* If the ramblock we're sending uses small pages, never bother. */
2336 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2337 return false;
2338 }
2339
2340 /* Not in postcopy at all? */
2341 if (!migration_in_postcopy()) {
2342 return false;
2343 }
2344
2345 /*
2346 * If we're already handling a postcopy request, don't preempt as this page
2347 * has got the same high priority.
2348 */
2349 if (pss->postcopy_requested) {
2350 return false;
2351 }
2352
2353 /* If there are postcopy requests, then check them! */
2354 return postcopy_has_request(rs);
2355}
2356
2357/* Preempt precopy, caching the current precopy state for a later restore */
2358static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2359{
2360 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2361
2362 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2363
2364 /*
2365 * Time to preempt precopy. Cache current PSS into preempt state, so that
2366 * after handling the postcopy pages we can recover to it. We need to do
2368 * so because the dest VM will have part of the precopy huge page kept
2368 * over in its tmp huge page caches; better move on with it when we can.
2369 */
2370 p_state->ram_block = pss->block;
2371 p_state->ram_page = pss->page;
2372 p_state->preempted = true;
2373}
2374
2375/* Whether we're preempted by a postcopy request during sending a huge page */
2376static bool postcopy_preempt_triggered(RAMState *rs)
2377{
2378 return rs->postcopy_preempt_state.preempted;
2379}
2380
82b54ef4
PX
2381static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2382 bool postcopy_requested)
c01b16ed
PX
2383{
2384 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2385
2386 assert(state->preempted);
2387
2388 pss->block = state->ram_block;
2389 pss->page = state->ram_page;
82b54ef4
PX
2390
2391 /* Whether this is a postcopy request? */
2392 pss->postcopy_requested = postcopy_requested;
2393 /*
2394 * When restoring a preempted page, the old data resides in PRECOPY
2395 * slow channel, even if postcopy_requested is set. So always use
2396 * PRECOPY channel here.
2397 */
2398 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
c01b16ed
PX
2399
2400 trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2401
2402 /* Reset preempt state, most importantly, set preempted==false */
2403 postcopy_preempt_reset(rs);
2404}
2405
2406static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2407{
2408 MigrationState *s = migrate_get_current();
82b54ef4 2409 unsigned int channel = pss->postcopy_target_channel;
c01b16ed
PX
2410 QEMUFile *next;
2411
c01b16ed
PX
2412 if (channel != rs->postcopy_channel) {
2413 if (channel == RAM_CHANNEL_PRECOPY) {
2414 next = s->to_dst_file;
2415 } else {
2416 next = s->postcopy_qemufile_src;
2417 }
2418 /* Update and cache the current channel */
2419 rs->f = next;
2420 rs->postcopy_channel = channel;
2421
2422 /*
2423 * If channel switched, reset last_sent_block since the old sent block
2424 * may not be on the same channel.
2425 */
2426 rs->last_sent_block = NULL;
2427
2428 trace_postcopy_preempt_switch_channel(channel);
2429 }
2430
2431 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2432}
2433
2434/* We need to make sure rs->f always points to the default channel elsewhere */
2435static void postcopy_preempt_reset_channel(RAMState *rs)
2436{
20123ee1 2437 if (postcopy_preempt_active()) {
c01b16ed
PX
2438 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2439 rs->f = migrate_get_current()->to_dst_file;
2440 trace_postcopy_preempt_reset_channel();
2441 }
2442}
2443
a82d593b 2444/**
3d0684b2 2445 * ram_save_host_page: save a whole host page
a82d593b 2446 *
3d0684b2
JQ
2447 * Starting at *offset send pages up to the end of the current host
2448 * page. It's valid for the initial offset to point into the middle of
2449 * a host page in which case the remainder of the hostpage is sent.
2450 * Only dirty target pages are sent. Note that the host page size may
2451 * be a huge page for this block.
1eb3fc0a
DDAG
2452 * The saving stops at the boundary of the used_length of the block
2453 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2454 *
3d0684b2
JQ
2455 * Returns the number of pages written or negative on error
2456 *
6f37bb8b 2457 * @rs: current RAM state
3d0684b2 2458 * @pss: data about the page we want to send
a82d593b 2459 */
05931ec5 2460static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2461{
2462 int tmppages, pages = 0;
a935e30f
JQ
2463 size_t pagesize_bits =
2464 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
ba1b7c81
KJ
2465 unsigned long hostpage_boundary =
2466 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
278e2f55
AG
2467 unsigned long start_page = pss->page;
2468 int res;
4c011c37 2469
fbd162e6 2470 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2471 error_report("block %s should not be migrated !", pss->block->idstr);
2472 return 0;
2473 }
2474
20123ee1 2475 if (postcopy_preempt_active()) {
c01b16ed
PX
2476 postcopy_preempt_choose_channel(rs, pss);
2477 }
2478
a82d593b 2479 do {
c01b16ed
PX
2480 if (postcopy_needs_preempt(rs, pss)) {
2481 postcopy_do_preempt(rs, pss);
2482 break;
2483 }
2484
1faa5665 2485 /* Check if the page is dirty and if so send it */
ba1b7c81 2486 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
05931ec5 2487 tmppages = ram_save_target_page(rs, pss);
ba1b7c81
KJ
2488 if (tmppages < 0) {
2489 return tmppages;
2490 }
a82d593b 2491
ba1b7c81
KJ
2492 pages += tmppages;
2493 /*
2494 * Allow rate limiting to happen in the middle of huge pages if
2495 * something is sent in the current iteration.
2496 */
2497 if (pagesize_bits > 1 && tmppages > 0) {
2498 migration_rate_limit();
2499 }
23feba90 2500 }
ba1b7c81
KJ
2501 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2502 } while ((pss->page < hostpage_boundary) &&
8bba004c
AR
2503 offset_in_ramblock(pss->block,
2504 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
ba1b7c81 2505 /* The offset we leave with is the min boundary of host page and block */
258f5c98 2506 pss->page = MIN(pss->page, hostpage_boundary);
278e2f55 2507
c01b16ed
PX
2508 /*
2509 * When in postcopy preempt mode, flush the data as soon as possible for
2510 * postcopy requests, because we've already sent a whole huge page, so the
2511 * dst node should already have enough resources to atomically fill in
2512 * the current missing page.
2513 *
2514 * More importantly, when using separate postcopy channel, we must do
2515 * explicit flush or it won't flush until the buffer is full.
2516 */
2517 if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2518 qemu_fflush(rs->f);
2519 }
2520
278e2f55
AG
2521 res = ram_save_release_protection(rs, pss, start_page);
2522 return (res < 0 ? res : pages);
a82d593b 2523}
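/*
 * A minimal standalone sketch (not part of ram.c) of the boundary arithmetic
 * ram_save_host_page() uses: with pagesize_bits target pages per host page,
 * a scan starting anywhere inside a host page stops at that host page's end,
 * i.e. QEMU_ALIGN_UP(page + 1, pagesize_bits). The 512:1 ratio (2 MiB host
 * pages, 4 KiB target pages) is an illustrative assumption.
 */
#include <stdio.h>

#define ALIGN_UP(n, m) ((((n) + (m) - 1) / (m)) * (m))

int main(void)
{
    unsigned long ratio = 512;  /* target pages per host page */
    unsigned long starts[] = { 0, 1, 511, 512, 700 };

    for (int i = 0; i < 5; i++) {
        unsigned long boundary = ALIGN_UP(starts[i] + 1, ratio);

        printf("start %4lu -> host page boundary %lu\n", starts[i], boundary);
    }
    return 0;
}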
6c595cde 2524
56e93d26 2525/**
3d0684b2 2526 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2527 *
2528 * Called within an RCU critical section.
2529 *
e8f3735f
XG
2530 * Returns the number of pages written where zero means no dirty pages,
2531 * or negative on error
56e93d26 2532 *
6f37bb8b 2533 * @rs: current RAM state
a82d593b
DDAG
2534 *
2535 * On systems where host-page-size > target-page-size it will send all the
2536 * pages in a host page that are dirty.
56e93d26 2537 */
05931ec5 2538static int ram_find_and_save_block(RAMState *rs)
56e93d26 2539{
b8fb8cb7 2540 PageSearchStatus pss;
56e93d26 2541 int pages = 0;
b9e60928 2542 bool again, found;
56e93d26 2543
0827b9e9
AA
2544 /* No dirty page as there is zero RAM */
2545 if (!ram_bytes_total()) {
2546 return pages;
2547 }
2548
4934a5dd
PX
2549 /*
2550 * Always keep last_seen_block/last_page valid during this procedure,
2551 * because find_dirty_block() relies on these values (e.g., we compare
2552 * last_seen_block with pss.block to see whether we searched all the
2553 * ramblocks) to detect the completion of migration. A NULL
2554 * last_seen_block can, in some conditions, make the loop below run forever.
2555 */
2556 if (!rs->last_seen_block) {
2557 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2558 rs->last_page = 0;
2559 }
2560
6f37bb8b 2561 pss.block = rs->last_seen_block;
a935e30f 2562 pss.page = rs->last_page;
b8fb8cb7
DDAG
2563 pss.complete_round = false;
2564
b9e60928 2565 do {
a82d593b 2566 again = true;
f20e2865 2567 found = get_queued_page(rs, &pss);
b9e60928 2568
a82d593b 2569 if (!found) {
c01b16ed
PX
2570 /*
2571 * Recover previous precopy ramblock/offset if postcopy has
2572 * preempted precopy. Otherwise find the next dirty bit.
2573 */
2574 if (postcopy_preempt_triggered(rs)) {
82b54ef4 2575 postcopy_preempt_restore(rs, &pss, false);
c01b16ed
PX
2576 found = true;
2577 } else {
2578 /* priority queue empty, so just search for something dirty */
2579 found = find_dirty_block(rs, &pss, &again);
2580 }
a82d593b 2581 }
f3f491fc 2582
a82d593b 2583 if (found) {
05931ec5 2584 pages = ram_save_host_page(rs, &pss);
56e93d26 2585 }
b9e60928 2586 } while (!pages && again);
56e93d26 2587
6f37bb8b 2588 rs->last_seen_block = pss.block;
a935e30f 2589 rs->last_page = pss.page;
56e93d26
JQ
2590
2591 return pages;
2592}
2593
2594void acct_update_position(QEMUFile *f, size_t size, bool zero)
2595{
2596 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2597
56e93d26 2598 if (zero) {
9360447d 2599 ram_counters.duplicate += pages;
56e93d26 2600 } else {
9360447d 2601 ram_counters.normal += pages;
4c2d0f6d 2602 ram_transferred_add(size);
1a93bd2f 2603 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2604 }
2605}
2606
fbd162e6 2607static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2608{
2609 RAMBlock *block;
2610 uint64_t total = 0;
2611
89ac5a1d
DDAG
2612 RCU_READ_LOCK_GUARD();
2613
fbd162e6
YK
2614 if (count_ignored) {
2615 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2616 total += block->used_length;
2617 }
2618 } else {
2619 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2620 total += block->used_length;
2621 }
99e15582 2622 }
56e93d26
JQ
2623 return total;
2624}
2625
fbd162e6
YK
2626uint64_t ram_bytes_total(void)
2627{
2628 return ram_bytes_total_common(false);
2629}
2630
f265e0e4 2631static void xbzrle_load_setup(void)
56e93d26 2632{
f265e0e4 2633 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2634}
2635
f265e0e4
JQ
2636static void xbzrle_load_cleanup(void)
2637{
2638 g_free(XBZRLE.decoded_buf);
2639 XBZRLE.decoded_buf = NULL;
2640}
2641
7d7c96be
PX
2642static void ram_state_cleanup(RAMState **rsp)
2643{
b9ccaf6d
DDAG
2644 if (*rsp) {
2645 migration_page_queue_free(*rsp);
2646 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2647 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2648 g_free(*rsp);
2649 *rsp = NULL;
2650 }
7d7c96be
PX
2651}
2652
84593a08
PX
2653static void xbzrle_cleanup(void)
2654{
2655 XBZRLE_cache_lock();
2656 if (XBZRLE.cache) {
2657 cache_fini(XBZRLE.cache);
2658 g_free(XBZRLE.encoded_buf);
2659 g_free(XBZRLE.current_buf);
2660 g_free(XBZRLE.zero_target_page);
2661 XBZRLE.cache = NULL;
2662 XBZRLE.encoded_buf = NULL;
2663 XBZRLE.current_buf = NULL;
2664 XBZRLE.zero_target_page = NULL;
2665 }
2666 XBZRLE_cache_unlock();
2667}
2668
f265e0e4 2669static void ram_save_cleanup(void *opaque)
56e93d26 2670{
53518d94 2671 RAMState **rsp = opaque;
6b6712ef 2672 RAMBlock *block;
eb859c53 2673
278e2f55
AG
2674 /* We don't use dirty log with background snapshots */
2675 if (!migrate_background_snapshot()) {
2676 /* The caller holds the iothread lock or is in a bottom half, so there is
2677 * no write race against the migration bitmap
2678 */
63b41db4
HH
2679 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2680 /*
2681 * do not stop dirty log without starting it, since
2682 * memory_global_dirty_log_stop will assert that
2683 * memory_global_dirty_log_start/stop are used in pairs
2684 */
2685 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2686 }
278e2f55 2687 }
6b6712ef 2688
fbd162e6 2689 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2690 g_free(block->clear_bmap);
2691 block->clear_bmap = NULL;
6b6712ef
JQ
2692 g_free(block->bmap);
2693 block->bmap = NULL;
56e93d26
JQ
2694 }
2695
84593a08 2696 xbzrle_cleanup();
f0afa331 2697 compress_threads_save_cleanup();
7d7c96be 2698 ram_state_cleanup(rsp);
56e93d26
JQ
2699}
2700
6f37bb8b 2701static void ram_state_reset(RAMState *rs)
56e93d26 2702{
6f37bb8b
JQ
2703 rs->last_seen_block = NULL;
2704 rs->last_sent_block = NULL;
269ace29 2705 rs->last_page = 0;
6f37bb8b 2706 rs->last_version = ram_list.version;
1a373522 2707 rs->xbzrle_enabled = false;
c01b16ed
PX
2708 postcopy_preempt_reset(rs);
2709 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
56e93d26
JQ
2710}
2711
2712#define MAX_WAIT 50 /* ms, half buffered_file limit */
2713
e0b266f0
DDAG
2714/* **** functions for postcopy ***** */
2715
ced1c616
PB
2716void ram_postcopy_migrated_memory_release(MigrationState *ms)
2717{
2718 struct RAMBlock *block;
ced1c616 2719
fbd162e6 2720 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2721 unsigned long *bitmap = block->bmap;
2722 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2723 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2724
2725 while (run_start < range) {
2726 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2727 ram_discard_range(block->idstr,
2728 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2729 ((ram_addr_t)(run_end - run_start))
2730 << TARGET_PAGE_BITS);
ced1c616
PB
2731 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2732 }
2733 }
2734}
2735
3d0684b2
JQ
2736/**
2737 * postcopy_send_discard_bm_ram: discard a RAMBlock
2738 *
e0b266f0 2739 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2740 *
2741 * @ms: current migration state
89dab31b 2742 * @block: RAMBlock to discard
e0b266f0 2743 */
9e7d1223 2744static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2745{
6b6712ef 2746 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2747 unsigned long current;
1e7cf8c3 2748 unsigned long *bitmap = block->bmap;
e0b266f0 2749
6b6712ef 2750 for (current = 0; current < end; ) {
1e7cf8c3 2751 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2752 unsigned long zero, discard_length;
e0b266f0 2753
33a5cb62
WY
2754 if (one >= end) {
2755 break;
2756 }
e0b266f0 2757
1e7cf8c3 2758 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2759
2760 if (zero >= end) {
2761 discard_length = end - one;
e0b266f0 2762 } else {
33a5cb62
WY
2763 discard_length = zero - one;
2764 }
810cf2bb 2765 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2766 current = one + discard_length;
e0b266f0 2767 }
e0b266f0
DDAG
2768}
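/*
 * A minimal standalone sketch (not part of ram.c) of the run extraction done
 * by postcopy_send_discard_bm_ram(): walk the dirty bitmap and emit
 * (start, length) pairs for each run of set bits, which is the shape of data
 * postcopy_discard_send_range() puts on the wire. A byte-per-page array
 * stands in for the real bit array and its find_next_bit() helpers.
 */
#include <stdio.h>

static void emit_discard_runs(const unsigned char *dirty, unsigned long npages)
{
    unsigned long cur = 0;

    while (cur < npages) {
        unsigned long one = cur, zero;

        while (one < npages && !dirty[one]) {   /* next set bit */
            one++;
        }
        if (one >= npages) {
            break;
        }
        zero = one + 1;                         /* end of this run */
        while (zero < npages && dirty[zero]) {
            zero++;
        }
        printf("discard start=%lu len=%lu\n", one, zero - one);
        cur = zero;
    }
}

int main(void)
{
    unsigned char dirty[10] = { 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 };

    emit_discard_runs(dirty, 10);   /* start=2 len=3, start=7 len=1 */
    return 0;
}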
2769
f30c2e5b
PX
2770static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2771
3d0684b2
JQ
2772/**
2773 * postcopy_each_ram_send_discard: discard all RAMBlocks
2774 *
e0b266f0
DDAG
2775 * Utility for the outgoing postcopy code.
2776 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2777 * passing it bitmap indexes and name.
e0b266f0
DDAG
2778 * (qemu_ram_foreach_block ends up passing unscaled lengths
2779 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2780 *
2781 * @ms: current migration state
e0b266f0 2782 */
739fcc1b 2783static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2784{
2785 struct RAMBlock *block;
e0b266f0 2786
fbd162e6 2787 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2788 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2789
f30c2e5b
PX
2790 /*
2791 * Deal with TPS != HPS and huge pages. It discard any partially sent
2792 * host-page size chunks, mark any partially dirty host-page size
2793 * chunks as all dirty. In this case the host-page is the host-page
2794 * for the particular RAMBlock, i.e. it might be a huge page.
2795 */
2796 postcopy_chunk_hostpages_pass(ms, block);
2797
e0b266f0
DDAG
2798 /*
2799 * Postcopy sends chunks of bitmap over the wire, but it
2800 * just needs indexes at this point, avoids it having
2801 * target page specific code.
2802 */
739fcc1b 2803 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2804 postcopy_discard_send_finish(ms);
e0b266f0 2805 }
e0b266f0
DDAG
2806}
2807
3d0684b2 2808/**
8324ef86 2809 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2810 *
2811 * Helper for postcopy_each_ram_send_discard; it canonicalizes the
2812 * dirty bitmap of a RAMBlock at host-page granularity before the
2813 * discards are sent.
99e314eb 2814 *
3d0684b2
JQ
2815 * Postcopy requires that all target pages in a hostpage are dirty or
2816 * clean, not a mix. This function canonicalizes the bitmap.
99e314eb 2817 *
3d0684b2 2818 * @ms: current migration state
3d0684b2 2819 * @block: block that contains the page we want to canonicalize
99e314eb 2820 */
1e7cf8c3 2821static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2822{
53518d94 2823 RAMState *rs = ram_state;
6b6712ef 2824 unsigned long *bitmap = block->bmap;
29c59172 2825 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2826 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2827 unsigned long run_start;
2828
29c59172
DDAG
2829 if (block->page_size == TARGET_PAGE_SIZE) {
2830 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2831 return;
2832 }
2833
1e7cf8c3
WY
2834 /* Find a dirty page */
2835 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2836
6b6712ef 2837 while (run_start < pages) {
99e314eb
DDAG
2838
2839 /*
2840 * If the start of this run of pages is in the middle of a host
2841 * page, then we need to fixup this host page.
2842 */
9dec3cc3 2843 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2844 /* Find the end of this run */
1e7cf8c3 2845 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2846 /*
2847 * If the end isn't at the start of a host page, then the
2848 * run doesn't finish at the end of a host page
2849 * and we need to discard.
2850 */
99e314eb
DDAG
2851 }
2852
9dec3cc3 2853 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2854 unsigned long page;
dad45ab2
WY
2855 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2856 host_ratio);
2857 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2858
99e314eb
DDAG
2859 /* Clean up the bitmap */
2860 for (page = fixup_start_addr;
2861 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2862 /*
2863 * Remark them as dirty, updating the count for any pages
2864 * that weren't previously dirty.
2865 */
0d8ec885 2866 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2867 }
2868 }
2869
1e7cf8c3
WY
2870 /* Find the next dirty page for the next iteration */
2871 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2872 }
2873}
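/*
 * A minimal standalone sketch (not part of ram.c) of the invariant
 * postcopy_chunk_hostpages_pass() establishes: any host page that contains at
 * least one dirty target page is made fully dirty, counting the newly set
 * pages the way rs->migration_dirty_pages is bumped above. The byte-per-page
 * array is an illustrative simplification of the real bitmap.
 */
#include <stdio.h>

static unsigned long canonicalize(unsigned char *dirty, unsigned long npages,
                                  unsigned long host_ratio)
{
    unsigned long added = 0;

    for (unsigned long hp = 0; hp < npages; hp += host_ratio) {
        int any_dirty = 0;

        for (unsigned long i = hp; i < hp + host_ratio; i++) {
            any_dirty |= dirty[i];
        }
        if (!any_dirty) {
            continue;
        }
        for (unsigned long i = hp; i < hp + host_ratio; i++) {
            added += !dirty[i];     /* count pages that become dirty here */
            dirty[i] = 1;
        }
    }
    return added;
}

int main(void)
{
    unsigned char dirty[8] = { 0, 1, 0, 0, 0, 0, 0, 0 };   /* host_ratio = 4 */
    unsigned long added = canonicalize(dirty, 8, 4);

    printf("newly dirtied target pages: %lu\n", added);    /* 3 */
    for (int i = 0; i < 8; i++) {
        printf("%d", dirty[i]);                            /* 11110000 */
    }
    printf("\n");
    return 0;
}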
2874
3d0684b2
JQ
2875/**
2876 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2877 *
e0b266f0
DDAG
2878 * Transmit the set of pages to be discarded after precopy to the target;
2879 * these are pages that:
2880 * a) Have been previously transmitted but are now dirty again
2881 * b) Pages that have never been transmitted; this ensures that
2882 * any pages on the destination that have been mapped by background
2883 * tasks get discarded (transparent huge pages are the specific concern)
2884 * Hopefully this is pretty sparse
3d0684b2
JQ
2885 *
2886 * @ms: current migration state
e0b266f0 2887 */
739fcc1b 2888void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2889{
53518d94 2890 RAMState *rs = ram_state;
e0b266f0 2891
89ac5a1d 2892 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2893
2894 /* This should be our last sync, the src is now paused */
eb859c53 2895 migration_bitmap_sync(rs);
e0b266f0 2896
6b6712ef
JQ
2897 /* Easiest way to make sure we don't resume in the middle of a host-page */
2898 rs->last_seen_block = NULL;
2899 rs->last_sent_block = NULL;
2900 rs->last_page = 0;
e0b266f0 2901
739fcc1b 2902 postcopy_each_ram_send_discard(ms);
e0b266f0 2903
739fcc1b 2904 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2905}
2906
3d0684b2
JQ
2907/**
2908 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2909 *
3d0684b2 2910 * Returns zero on success
e0b266f0 2911 *
36449157
JQ
2912 * @rbname: name of the RAMBlock of the request. NULL means the
2913 * same that last one.
3d0684b2
JQ
2914 * @start: RAMBlock starting page
2915 * @length: RAMBlock size
e0b266f0 2916 */
aaa2064c 2917int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2918{
36449157 2919 trace_ram_discard_range(rbname, start, length);
d3a5038c 2920
89ac5a1d 2921 RCU_READ_LOCK_GUARD();
36449157 2922 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2923
2924 if (!rb) {
36449157 2925 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2926 return -1;
e0b266f0
DDAG
2927 }
2928
814bb08f
PX
2929 /*
2930 * On source VM, we don't need to update the received bitmap since
2931 * we don't even have one.
2932 */
2933 if (rb->receivedmap) {
2934 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2935 length >> qemu_target_page_bits());
2936 }
2937
03acb4e9 2938 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2939}
2940
84593a08
PX
2941/*
2942 * For every allocation, we will try not to crash the VM if the
2943 * allocation failed.
2944 */
2945static int xbzrle_init(void)
2946{
2947 Error *local_err = NULL;
2948
2949 if (!migrate_use_xbzrle()) {
2950 return 0;
2951 }
2952
2953 XBZRLE_cache_lock();
2954
2955 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2956 if (!XBZRLE.zero_target_page) {
2957 error_report("%s: Error allocating zero page", __func__);
2958 goto err_out;
2959 }
2960
2961 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2962 TARGET_PAGE_SIZE, &local_err);
2963 if (!XBZRLE.cache) {
2964 error_report_err(local_err);
2965 goto free_zero_page;
2966 }
2967
2968 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2969 if (!XBZRLE.encoded_buf) {
2970 error_report("%s: Error allocating encoded_buf", __func__);
2971 goto free_cache;
2972 }
2973
2974 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2975 if (!XBZRLE.current_buf) {
2976 error_report("%s: Error allocating current_buf", __func__);
2977 goto free_encoded_buf;
2978 }
2979
2980 /* We are all good */
2981 XBZRLE_cache_unlock();
2982 return 0;
2983
2984free_encoded_buf:
2985 g_free(XBZRLE.encoded_buf);
2986 XBZRLE.encoded_buf = NULL;
2987free_cache:
2988 cache_fini(XBZRLE.cache);
2989 XBZRLE.cache = NULL;
2990free_zero_page:
2991 g_free(XBZRLE.zero_target_page);
2992 XBZRLE.zero_target_page = NULL;
2993err_out:
2994 XBZRLE_cache_unlock();
2995 return -ENOMEM;
2996}
2997
53518d94 2998static int ram_state_init(RAMState **rsp)
56e93d26 2999{
7d00ee6a
PX
3000 *rsp = g_try_new0(RAMState, 1);
3001
3002 if (!*rsp) {
3003 error_report("%s: Init ramstate fail", __func__);
3004 return -1;
3005 }
53518d94
JQ
3006
3007 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3008 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3009 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3010
7d00ee6a 3011 /*
40c4d4a8
IR
3012 * Count the total number of pages used by ram blocks not including any
3013 * gaps due to alignment or unplugs.
03158519 3014 * This must match with the initial values of dirty bitmap.
7d00ee6a 3015 */
40c4d4a8 3016 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3017 ram_state_reset(*rsp);
3018
3019 return 0;
3020}
3021
d6eff5d7 3022static void ram_list_init_bitmaps(void)
7d00ee6a 3023{
002cad6b 3024 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3025 RAMBlock *block;
3026 unsigned long pages;
002cad6b 3027 uint8_t shift;
56e93d26 3028
0827b9e9
AA
3029 /* Skip setting bitmap if there is no RAM */
3030 if (ram_bytes_total()) {
002cad6b
PX
3031 shift = ms->clear_bitmap_shift;
3032 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3033 error_report("clear_bitmap_shift (%u) too big, using "
3034 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3035 shift = CLEAR_BITMAP_SHIFT_MAX;
3036 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3037 error_report("clear_bitmap_shift (%u) too small, using "
3038 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3039 shift = CLEAR_BITMAP_SHIFT_MIN;
3040 }
3041
fbd162e6 3042 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3043 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3044 /*
3045 * The initial dirty bitmap for migration must be set with all
3046 * ones to make sure we'll migrate every guest RAM page to
3047 * destination.
40c4d4a8
IR
3048 * Here we set RAMBlock.bmap all to 1 because when we restart a
3049 * new migration after a failed one, ram_list.
3050 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3051 * guest memory.
03158519 3052 */
6b6712ef 3053 block->bmap = bitmap_new(pages);
40c4d4a8 3054 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3055 block->clear_bmap_shift = shift;
3056 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3057 }
f3f491fc 3058 }
d6eff5d7
PX
3059}
3060
be39b4cd
DH
3061static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3062{
3063 unsigned long pages;
3064 RAMBlock *rb;
3065
3066 RCU_READ_LOCK_GUARD();
3067
3068 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3069 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3070 rs->migration_dirty_pages -= pages;
3071 }
3072}
3073
d6eff5d7
PX
3074static void ram_init_bitmaps(RAMState *rs)
3075{
3076 /* For memory_global_dirty_log_start below. */
3077 qemu_mutex_lock_iothread();
3078 qemu_mutex_lock_ramlist();
f3f491fc 3079
89ac5a1d
DDAG
3080 WITH_RCU_READ_LOCK_GUARD() {
3081 ram_list_init_bitmaps();
278e2f55
AG
3082 /* We don't use dirty log with background snapshots */
3083 if (!migrate_background_snapshot()) {
63b41db4 3084 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3085 migration_bitmap_sync_precopy(rs);
3086 }
89ac5a1d 3087 }
56e93d26 3088 qemu_mutex_unlock_ramlist();
49877834 3089 qemu_mutex_unlock_iothread();
be39b4cd
DH
3090
3091 /*
3092 * After an eventual first bitmap sync, fixup the initial bitmap
3093 * containing all 1s to exclude any discarded pages from migration.
3094 */
3095 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3096}
3097
3098static int ram_init_all(RAMState **rsp)
3099{
3100 if (ram_state_init(rsp)) {
3101 return -1;
3102 }
3103
3104 if (xbzrle_init()) {
3105 ram_state_cleanup(rsp);
3106 return -1;
3107 }
3108
3109 ram_init_bitmaps(*rsp);
a91246c9
HZ
3110
3111 return 0;
3112}
3113
08614f34
PX
3114static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3115{
3116 RAMBlock *block;
3117 uint64_t pages = 0;
3118
3119 /*
3120 * Postcopy is not using xbzrle/compression, so no need for that.
3121 * Also, since the source is already halted, we don't need to care
3122 * about dirty page logging either.
3123 */
3124
fbd162e6 3125 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3126 pages += bitmap_count_one(block->bmap,
3127 block->used_length >> TARGET_PAGE_BITS);
3128 }
3129
3130 /* This may not be aligned with current bitmaps. Recalculate. */
3131 rs->migration_dirty_pages = pages;
3132
1a373522 3133 ram_state_reset(rs);
08614f34
PX
3134
3135 /* Update RAMState cache of output QEMUFile */
3136 rs->f = out;
3137
3138 trace_ram_state_resume_prepare(pages);
3139}
3140
6bcb05fc
WW
3141/*
3142 * This function clears bits of the free pages reported by the caller from the
3143 * migration dirty bitmap. @addr is the host address corresponding to the
3144 * start of the continuous guest free pages, and @len is the total bytes of
3145 * those pages.
3146 */
3147void qemu_guest_free_page_hint(void *addr, size_t len)
3148{
3149 RAMBlock *block;
3150 ram_addr_t offset;
3151 size_t used_len, start, npages;
3152 MigrationState *s = migrate_get_current();
3153
3154 /* This function is currently expected to be used during live migration */
3155 if (!migration_is_setup_or_active(s->state)) {
3156 return;
3157 }
3158
3159 for (; len > 0; len -= used_len, addr += used_len) {
3160 block = qemu_ram_block_from_host(addr, false, &offset);
3161 if (unlikely(!block || offset >= block->used_length)) {
3162 /*
3163 * The implementation might not support RAMBlock resize during
3164 * live migration, but it could happen in theory with future
3165 * updates. So we add a check here to capture that case.
3166 */
3167 error_report_once("%s unexpected error", __func__);
3168 return;
3169 }
3170
3171 if (len <= block->used_length - offset) {
3172 used_len = len;
3173 } else {
3174 used_len = block->used_length - offset;
3175 }
3176
3177 start = offset >> TARGET_PAGE_BITS;
3178 npages = used_len >> TARGET_PAGE_BITS;
3179
3180 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3181 /*
3182 * From clear_bmap's perspective, the skipped free pages count as already
3183 * sent, so clear the bits from the memory region bitmap which
3184 * are initially set. Otherwise those skipped pages will be sent in
3185 * the next round after syncing from the memory region bitmap.
3186 */
1230a25f 3187 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3188 ram_state->migration_dirty_pages -=
3189 bitmap_count_one_with_offset(block->bmap, start, npages);
3190 bitmap_clear(block->bmap, start, npages);
3191 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3192 }
3193}
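/*
 * A minimal standalone sketch (not part of ram.c) of the clamping loop in
 * qemu_guest_free_page_hint(): a hinted [addr, len) range is consumed block
 * by block, each chunk limited to the containing block's used_length, and
 * converted to a (first page, page count) pair. Block names and sizes are
 * illustrative assumptions.
 */
#include <stdio.h>

#define TARGET_PAGE_BITS 12

struct blk {
    const char *name;
    unsigned long start;
    unsigned long used_length;
};

static void free_page_hint(struct blk *blocks, int nblocks,
                           unsigned long addr, unsigned long len)
{
    while (len > 0) {
        struct blk *b = NULL;
        unsigned long offset, used;

        for (int i = 0; i < nblocks; i++) {     /* find the containing block */
            if (addr >= blocks[i].start &&
                addr < blocks[i].start + blocks[i].used_length) {
                b = &blocks[i];
                break;
            }
        }
        if (!b) {
            return;                             /* hint outside any block */
        }
        offset = addr - b->start;
        used = b->used_length - offset;
        if (len < used) {
            used = len;
        }
        printf("%s: clear %lu pages starting at page %lu\n", b->name,
               used >> TARGET_PAGE_BITS, offset >> TARGET_PAGE_BITS);
        addr += used;
        len -= used;
    }
}

int main(void)
{
    struct blk blocks[] = {
        { "ram0", 0x000000, 0x100000 },
        { "ram1", 0x100000, 0x100000 },
    };

    /* A 512 KiB hint that straddles the two blocks. */
    free_page_hint(blocks, 2, 0xc0000, 0x80000);
    return 0;
}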
3194
3d0684b2
JQ
3195/*
3196 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3197 * long-running RCU critical section. When rcu-reclaims in the code
3198 * start to become numerous it will be necessary to reduce the
3199 * granularity of these critical sections.
3200 */
3201
3d0684b2
JQ
3202/**
3203 * ram_save_setup: Setup RAM for migration
3204 *
3205 * Returns zero to indicate success and negative for error
3206 *
3207 * @f: QEMUFile where to send the data
3208 * @opaque: RAMState pointer
3209 */
a91246c9
HZ
3210static int ram_save_setup(QEMUFile *f, void *opaque)
3211{
53518d94 3212 RAMState **rsp = opaque;
a91246c9 3213 RAMBlock *block;
33d70973 3214 int ret;
a91246c9 3215
dcaf446e
XG
3216 if (compress_threads_save_setup()) {
3217 return -1;
3218 }
3219
a91246c9
HZ
3220 /* migration has already setup the bitmap, reuse it. */
3221 if (!migration_in_colo_state()) {
7d00ee6a 3222 if (ram_init_all(rsp) != 0) {
dcaf446e 3223 compress_threads_save_cleanup();
a91246c9 3224 return -1;
53518d94 3225 }
a91246c9 3226 }
53518d94 3227 (*rsp)->f = f;
a91246c9 3228
0e6ebd48
DDAG
3229 WITH_RCU_READ_LOCK_GUARD() {
3230 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3231
0e6ebd48
DDAG
3232 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3233 qemu_put_byte(f, strlen(block->idstr));
3234 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3235 qemu_put_be64(f, block->used_length);
3236 if (migrate_postcopy_ram() && block->page_size !=
3237 qemu_host_page_size) {
3238 qemu_put_be64(f, block->page_size);
3239 }
3240 if (migrate_ignore_shared()) {
3241 qemu_put_be64(f, block->mr->addr);
3242 }
fbd162e6 3243 }
56e93d26
JQ
3244 }
3245
56e93d26
JQ
3246 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3247 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3248
33d70973
LB
3249 ret = multifd_send_sync_main(f);
3250 if (ret < 0) {
3251 return ret;
3252 }
3253
56e93d26 3254 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3255 qemu_fflush(f);
56e93d26
JQ
3256
3257 return 0;
3258}
3259
3d0684b2
JQ
3260/**
3261 * ram_save_iterate: iterative stage for migration
3262 *
3263 * Returns zero to indicate success and negative for error
3264 *
3265 * @f: QEMUFile where to send the data
3266 * @opaque: RAMState pointer
3267 */
56e93d26
JQ
3268static int ram_save_iterate(QEMUFile *f, void *opaque)
3269{
53518d94
JQ
3270 RAMState **temp = opaque;
3271 RAMState *rs = *temp;
3d4095b2 3272 int ret = 0;
56e93d26
JQ
3273 int i;
3274 int64_t t0;
5c90308f 3275 int done = 0;
56e93d26 3276
b2557345
PL
3277 if (blk_mig_bulk_active()) {
3278 /* Avoid transferring ram during bulk phase of block migration as
3279 * the bulk phase will usually take a long time and transferring
3280 * ram updates during that time is pointless. */
3281 goto out;
3282 }
3283
63268c49
PX
3284 /*
3285 * We'll hold this lock for a little while, but that's okay for two reasons.
3286 * Firstly, the only other thread that could possibly take it is the one
3287 * calling qemu_guest_free_page_hint(), which should be rare; secondly, see
3288 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3289 * guarantees that we'll release it on a regular basis.
3290 */
3291 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3292 WITH_RCU_READ_LOCK_GUARD() {
3293 if (ram_list.version != rs->last_version) {
3294 ram_state_reset(rs);
3295 }
56e93d26 3296
89ac5a1d
DDAG
3297 /* Read version before ram_list.blocks */
3298 smp_rmb();
56e93d26 3299
89ac5a1d 3300 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3301
89ac5a1d
DDAG
3302 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3303 i = 0;
3304 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3305 postcopy_has_request(rs)) {
89ac5a1d 3306 int pages;
e03a34f8 3307
89ac5a1d
DDAG
3308 if (qemu_file_get_error(f)) {
3309 break;
3310 }
e8f3735f 3311
05931ec5 3312 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3313 /* no more pages to send */
3314 if (pages == 0) {
3315 done = 1;
3316 break;
3317 }
e8f3735f 3318
89ac5a1d
DDAG
3319 if (pages < 0) {
3320 qemu_file_set_error(f, pages);
56e93d26
JQ
3321 break;
3322 }
89ac5a1d
DDAG
3323
3324 rs->target_page_count += pages;
3325
644acf99
WY
3326 /*
3327 * During postcopy, it is necessary to make sure one whole host
3328 * page is sent in one chunk.
3329 */
3330 if (migrate_postcopy_ram()) {
3331 flush_compressed_data(rs);
3332 }
3333
89ac5a1d
DDAG
3334 /*
3335 * We want to check in the 1st loop, just in case it was the 1st
3336 * time and we had to sync the dirty bitmap.
3337 * qemu_clock_get_ns() is a bit expensive, so we only check once
3338 * every few iterations.
3339 */
3340 if ((i & 63) == 0) {
3341 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3342 1000000;
3343 if (t1 > MAX_WAIT) {
3344 trace_ram_save_iterate_big_wait(t1, i);
3345 break;
3346 }
3347 }
3348 i++;
56e93d26 3349 }
56e93d26 3350 }
63268c49 3351 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3352
c01b16ed
PX
3353 postcopy_preempt_reset_channel(rs);
3354
56e93d26
JQ
3355 /*
3356 * Must occur before EOS (or any QEMUFile operation)
3357 * because of RDMA protocol.
3358 */
3359 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3360
b2557345 3361out:
b69a0227
JQ
3362 if (ret >= 0
3363 && migration_is_setup_or_active(migrate_get_current()->state)) {
33d70973
LB
3364 ret = multifd_send_sync_main(rs->f);
3365 if (ret < 0) {
3366 return ret;
3367 }
3368
3d4095b2
JQ
3369 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3370 qemu_fflush(f);
4c2d0f6d 3371 ram_transferred_add(8);
56e93d26 3372
3d4095b2
JQ
3373 ret = qemu_file_get_error(f);
3374 }
56e93d26
JQ
3375 if (ret < 0) {
3376 return ret;
3377 }
3378
5c90308f 3379 return done;
56e93d26
JQ
3380}
3381
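
The "(i & 63) == 0" check in the loop above keeps the code from querying the clock on every page. A minimal standalone sketch of that pattern follows; it is illustrative only, and the 50 ms budget merely stands in for MAX_WAIT, whose actual value is defined elsewhere in this file.

/* Sketch of the cheap periodic time-budget check used in ram_save_iterate().
 * The budget value and the "work" are stand-ins, not QEMU code. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
    const int64_t budget_ms = 50;      /* stand-in for MAX_WAIT */
    int64_t t0 = now_ns();
    int i = 0;

    for (;;) {
        /* ... send one page here ... */

        /* Only look at the clock every 64 iterations: querying it on
         * every page would be comparatively expensive. */
        if ((i & 63) == 0) {
            int64_t elapsed_ms = (now_ns() - t0) / 1000000;
            if (elapsed_ms > budget_ms) {
                printf("yielding after %d pages, %lld ms\n",
                       i, (long long)elapsed_ms);
                break;
            }
        }
        i++;
        if (i > 1000000) {      /* keep the demo finite */
            break;
        }
    }
    return 0;
}
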
3d0684b2
JQ
3382/**
3383 * ram_save_complete: function called to send the remaining amount of ram
3384 *
e8f3735f 3385 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3386 *
3387 * Called with iothread lock
3388 *
3389 * @f: QEMUFile where to send the data
3390 * @opaque: RAMState pointer
3391 */
56e93d26
JQ
3392static int ram_save_complete(QEMUFile *f, void *opaque)
3393{
53518d94
JQ
3394 RAMState **temp = opaque;
3395 RAMState *rs = *temp;
e8f3735f 3396 int ret = 0;
6f37bb8b 3397
05931ec5
JQ
3398 rs->last_stage = !migration_in_colo_state();
3399
89ac5a1d
DDAG
3400 WITH_RCU_READ_LOCK_GUARD() {
3401 if (!migration_in_postcopy()) {
3402 migration_bitmap_sync_precopy(rs);
3403 }
56e93d26 3404
89ac5a1d 3405 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3406
89ac5a1d 3407 /* try transferring iterative blocks of memory */
56e93d26 3408
89ac5a1d 3409 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3410 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3411 while (true) {
3412 int pages;
56e93d26 3413
05931ec5 3414 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3415 /* no more blocks to send */
3416 if (pages == 0) {
3417 break;
3418 }
3419 if (pages < 0) {
3420 ret = pages;
3421 break;
3422 }
e8f3735f 3423 }
c13221b5 3424 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3425
89ac5a1d
DDAG
3426 flush_compressed_data(rs);
3427 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3428 }
d09a6fde 3429
33d70973
LB
3430 if (ret < 0) {
3431 return ret;
3d4095b2 3432 }
56e93d26 3433
c01b16ed
PX
3434 postcopy_preempt_reset_channel(rs);
3435
33d70973
LB
3436 ret = multifd_send_sync_main(rs->f);
3437 if (ret < 0) {
3438 return ret;
3439 }
3440
3441 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3442 qemu_fflush(f);
3443
3444 return 0;
56e93d26
JQ
3445}
3446
c31b098f 3447static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3448 uint64_t *res_precopy_only,
3449 uint64_t *res_compatible,
3450 uint64_t *res_postcopy_only)
56e93d26 3451{
53518d94
JQ
3452 RAMState **temp = opaque;
3453 RAMState *rs = *temp;
56e93d26
JQ
3454 uint64_t remaining_size;
3455
9edabd4d 3456 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3457
5727309d 3458 if (!migration_in_postcopy() &&
663e6c1d 3459 remaining_size < max_size) {
56e93d26 3460 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3461 WITH_RCU_READ_LOCK_GUARD() {
3462 migration_bitmap_sync_precopy(rs);
3463 }
56e93d26 3464 qemu_mutex_unlock_iothread();
9edabd4d 3465 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3466 }
c31b098f 3467
86e1167e
VSO
3468 if (migrate_postcopy_ram()) {
3469 /* We can do postcopy, and all the data is postcopiable */
47995026 3470 *res_compatible += remaining_size;
86e1167e 3471 } else {
47995026 3472 *res_precopy_only += remaining_size;
86e1167e 3473 }
56e93d26
JQ
3474}
3475
3476static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3477{
3478 unsigned int xh_len;
3479 int xh_flags;
063e760a 3480 uint8_t *loaded_data;
56e93d26 3481
56e93d26
JQ
3482 /* extract RLE header */
3483 xh_flags = qemu_get_byte(f);
3484 xh_len = qemu_get_be16(f);
3485
3486 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3487 error_report("Failed to load XBZRLE page - wrong compression!");
3488 return -1;
3489 }
3490
3491 if (xh_len > TARGET_PAGE_SIZE) {
3492 error_report("Failed to load XBZRLE page - len overflow!");
3493 return -1;
3494 }
f265e0e4 3495 loaded_data = XBZRLE.decoded_buf;
56e93d26 3496 /* load data and decode */
f265e0e4 3497 /* it can change loaded_data to point to an internal buffer */
063e760a 3498 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3499
3500 /* decode RLE */
063e760a 3501 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3502 TARGET_PAGE_SIZE) == -1) {
3503 error_report("Failed to load XBZRLE page - decode error!");
3504 return -1;
3505 }
3506
3507 return 0;
3508}
3509
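
load_xbzrle() above expects a small header in front of the encoded payload: a one-byte encoding flag followed by a big-endian 16-bit length that must not exceed the target page size. The standalone sketch below checks that header on a plain byte buffer; the page size and the flag value are defined locally for the sketch and are assumptions, not the QEMUFile-based code above.

/* Sketch of parsing the XBZRLE page header: <flags:1><len:be16><data:len>.
 * Standalone illustration only. */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE       4096
#define SKETCH_FLAG_XBZRLE     0x1     /* assumed value for the sketch */

static int parse_xbzrle_header(const uint8_t *buf, size_t buflen,
                               unsigned int *out_len)
{
    if (buflen < 3) {
        return -1;
    }
    uint8_t flags = buf[0];
    unsigned int len = ((unsigned int)buf[1] << 8) | buf[2];  /* big-endian */

    if (flags != SKETCH_FLAG_XBZRLE) {
        fprintf(stderr, "wrong compression flag 0x%x\n", flags);
        return -1;
    }
    if (len > SKETCH_PAGE_SIZE) {
        fprintf(stderr, "len overflow: %u\n", len);
        return -1;
    }
    *out_len = len;
    return 0;
}

int main(void)
{
    const uint8_t record[] = { SKETCH_FLAG_XBZRLE, 0x00, 0x10 /* len 16 */ };
    unsigned int len;

    if (parse_xbzrle_header(record, sizeof(record), &len) == 0) {
        printf("payload length: %u bytes\n", len);
    }
    return 0;
}
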
3d0684b2
JQ
3510/**
3511 * ram_block_from_stream: read a RAMBlock id from the migration stream
3512 *
3513 * Must be called from within a rcu critical section.
3514 *
56e93d26 3515 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3516 *
755e8d7c 3517 * @mis: the migration incoming state pointer
3d0684b2
JQ
3518 * @f: QEMUFile where to read the data from
3519 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3520 * @channel: the channel we're using
a7180877 3521 */
755e8d7c 3522static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3523 QEMUFile *f, int flags,
3524 int channel)
56e93d26 3525{
c01b16ed 3526 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3527 char id[256];
3528 uint8_t len;
3529
3530 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3531 if (!block) {
56e93d26
JQ
3532 error_report("Ack, bad migration stream!");
3533 return NULL;
3534 }
4c4bad48 3535 return block;
56e93d26
JQ
3536 }
3537
3538 len = qemu_get_byte(f);
3539 qemu_get_buffer(f, (uint8_t *)id, len);
3540 id[len] = 0;
3541
e3dd7493 3542 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3543 if (!block) {
3544 error_report("Can't find block %s", id);
3545 return NULL;
56e93d26
JQ
3546 }
3547
fbd162e6 3548 if (ramblock_is_ignored(block)) {
b895de50
CLG
3549 error_report("block %s should not be migrated !", id);
3550 return NULL;
3551 }
3552
c01b16ed 3553 mis->last_recv_block[channel] = block;
755e8d7c 3554
4c4bad48
HZ
3555 return block;
3556}
3557
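
ram_block_from_stream() above shows the two ways a page record names its RAMBlock: either the CONTINUE flag is set and the previously received block for that channel is reused, or a one-byte length plus the block's idstr follows on the wire. A compact standalone sketch of that decision is below; the parser state, helper names and the flag value are illustrative assumptions.

/* Sketch of block-id resolution: CONTINUE reuses the last block,
 * otherwise the record carries <len:1><idstr:len>. Illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_FLAG_CONTINUE 0x20      /* example flag value */

struct sketch_stream {
    const uint8_t *buf;
    size_t pos;
    char last_block[256];              /* last idstr seen on this channel */
};

static const char *resolve_block_id(struct sketch_stream *s, int flags)
{
    if (flags & SKETCH_FLAG_CONTINUE) {
        /* Same block as the previous page on this channel */
        return s->last_block[0] ? s->last_block : NULL;
    }
    uint8_t len = s->buf[s->pos++];
    memcpy(s->last_block, s->buf + s->pos, len);
    s->last_block[len] = '\0';
    s->pos += len;
    return s->last_block;
}

int main(void)
{
    const uint8_t wire[] = { 6, 'p', 'c', '.', 'r', 'a', 'm' };
    struct sketch_stream s = { .buf = wire, .pos = 0, .last_block = "" };

    printf("first page:  %s\n", resolve_block_id(&s, 0));
    printf("second page: %s\n", resolve_block_id(&s, SKETCH_FLAG_CONTINUE));
    return 0;
}
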
3558static inline void *host_from_ram_block_offset(RAMBlock *block,
3559 ram_addr_t offset)
3560{
3561 if (!offset_in_ramblock(block, offset)) {
3562 return NULL;
3563 }
3564
3565 return block->host + offset;
56e93d26
JQ
3566}
3567
6a23f639
DH
3568static void *host_page_from_ram_block_offset(RAMBlock *block,
3569 ram_addr_t offset)
3570{
3571 /* Note: Explicitly no check against offset_in_ramblock(). */
3572 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3573 block->page_size);
3574}
3575
3576static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3577 ram_addr_t offset)
3578{
3579 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3580}
3581
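
The two helpers above split a RAM-block offset into the start of its host page and the offset within that host page, which matters when the host page (for example a 2 MiB hugetlbfs page) is larger than the target page. A tiny standalone illustration of the same arithmetic, with made-up addresses and sizes:

/* Sketch of the align-down / in-page-offset arithmetic used above.
 * Addresses and sizes are made up; this is not QEMU code. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t host_base = 0x7f0000000000ULL;    /* pretend block->host */
    uint64_t offset    = 0x213000;             /* offset into the block */
    uint64_t page_size = 2 * 1024 * 1024;      /* 2 MiB huge page */

    uint64_t addr      = host_base + offset;
    uint64_t host_page = addr & ~(page_size - 1);   /* align down */
    uint64_t in_page   = addr & (page_size - 1);    /* offset inside page */

    /* Prints: host page start 0x7f0000200000, offset in page 0x13000 */
    printf("host page start 0x%" PRIx64 ", offset in page 0x%" PRIx64 "\n",
           host_page, in_page);
    return 0;
}
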
13af18f2 3582static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3583 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3584{
3585 if (!offset_in_ramblock(block, offset)) {
3586 return NULL;
3587 }
3588 if (!block->colo_cache) {
3589 error_report("%s: colo_cache is NULL in block :%s",
3590 __func__, block->idstr);
3591 return NULL;
3592 }
7d9acafa
ZC
3593
3594 /*
3595 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3596 * It helps us decide which pages in the ram cache should be flushed
3597 * into VM's RAM later.
3598 */
8af66371
HZ
3599 if (record_bitmap &&
3600 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3601 ram_state->migration_dirty_pages++;
3602 }
13af18f2
ZC
3603 return block->colo_cache + offset;
3604}
3605
3d0684b2
JQ
3606/**
3607 * ram_handle_compressed: handle the zero page case
3608 *
56e93d26
JQ
3609 * If a page (or a whole RDMA chunk) has been
3610 * determined to be zero, then zap it.
3d0684b2
JQ
3611 *
3612 * @host: host address for the zero page
3613 * @ch: what the page is filled from. We only support zero
3614 * @size: size of the zero page
56e93d26
JQ
3615 */
3616void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3617{
bad452a7 3618 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3619 memset(host, ch, size);
3620 }
3621}
3622
797ca154
XG
3623/* return the size after decompression, or negative value on error */
3624static int
3625qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3626 const uint8_t *source, size_t source_len)
3627{
3628 int err;
3629
3630 err = inflateReset(stream);
3631 if (err != Z_OK) {
3632 return -1;
3633 }
3634
3635 stream->avail_in = source_len;
3636 stream->next_in = (uint8_t *)source;
3637 stream->avail_out = dest_len;
3638 stream->next_out = dest;
3639
3640 err = inflate(stream, Z_NO_FLUSH);
3641 if (err != Z_STREAM_END) {
3642 return -1;
3643 }
3644
3645 return stream->total_out;
3646}
3647
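
qemu_uncompress_data() above follows the standard zlib pattern: reset the stream, point avail_in/next_in at the compressed buffer and avail_out/next_out at the destination, then call inflate() once with Z_NO_FLUSH and expect Z_STREAM_END because the whole page is available in one shot. Below is a standalone round-trip sketch of that pattern; compress2() merely stands in for the sender-side compression, and the code is not QEMU's (build with -lz).

/* Standalone zlib round trip mirroring the inflate pattern above.
 * Compile with: cc sketch.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define PAGE 4096

int main(void)
{
    static unsigned char page[PAGE], comp[PAGE * 2], out[PAGE];
    uLongf comp_len = sizeof(comp);
    z_stream strm;

    memset(page, 'A', sizeof(page));                 /* dummy page content */

    /* Sender side (stand-in): produce a zlib stream */
    if (compress2(comp, &comp_len, page, PAGE, 1) != Z_OK) {
        return 1;
    }

    /* Receiver side: same steps as qemu_uncompress_data() */
    memset(&strm, 0, sizeof(strm));
    if (inflateInit(&strm) != Z_OK) {
        return 1;
    }
    strm.avail_in  = comp_len;
    strm.next_in   = comp;
    strm.avail_out = PAGE;
    strm.next_out  = out;

    /* The whole page is available, so one call must reach Z_STREAM_END */
    if (inflate(&strm, Z_NO_FLUSH) != Z_STREAM_END) {
        inflateEnd(&strm);
        return 1;
    }
    printf("decompressed %lu bytes\n", strm.total_out);
    inflateEnd(&strm);

    return memcmp(page, out, PAGE) ? 1 : 0;
}
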
56e93d26
JQ
3648static void *do_data_decompress(void *opaque)
3649{
3650 DecompressParam *param = opaque;
3651 unsigned long pagesize;
33d151f4 3652 uint8_t *des;
34ab9e97 3653 int len, ret;
56e93d26 3654
33d151f4 3655 qemu_mutex_lock(&param->mutex);
90e56fb4 3656 while (!param->quit) {
33d151f4
LL
3657 if (param->des) {
3658 des = param->des;
3659 len = param->len;
3660 param->des = 0;
3661 qemu_mutex_unlock(&param->mutex);
3662
56e93d26 3663 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3664
3665 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3666 param->compbuf, len);
f548222c 3667 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3668 error_report("decompress data failed");
3669 qemu_file_set_error(decomp_file, ret);
3670 }
73a8912b 3671
33d151f4
LL
3672 qemu_mutex_lock(&decomp_done_lock);
3673 param->done = true;
3674 qemu_cond_signal(&decomp_done_cond);
3675 qemu_mutex_unlock(&decomp_done_lock);
3676
3677 qemu_mutex_lock(&param->mutex);
3678 } else {
3679 qemu_cond_wait(&param->cond, &param->mutex);
3680 }
56e93d26 3681 }
33d151f4 3682 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3683
3684 return NULL;
3685}
3686
34ab9e97 3687static int wait_for_decompress_done(void)
5533b2e9
LL
3688{
3689 int idx, thread_count;
3690
3691 if (!migrate_use_compression()) {
34ab9e97 3692 return 0;
5533b2e9
LL
3693 }
3694
3695 thread_count = migrate_decompress_threads();
3696 qemu_mutex_lock(&decomp_done_lock);
3697 for (idx = 0; idx < thread_count; idx++) {
3698 while (!decomp_param[idx].done) {
3699 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3700 }
3701 }
3702 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3703 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3704}
3705
f0afa331 3706static void compress_threads_load_cleanup(void)
56e93d26
JQ
3707{
3708 int i, thread_count;
3709
3416ab5b
JQ
3710 if (!migrate_use_compression()) {
3711 return;
3712 }
56e93d26
JQ
3713 thread_count = migrate_decompress_threads();
3714 for (i = 0; i < thread_count; i++) {
797ca154
XG
3715 /*
3716 * we use it as an indicator of whether the thread has been
3717 * properly initialized or not
3718 */
3719 if (!decomp_param[i].compbuf) {
3720 break;
3721 }
3722
56e93d26 3723 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3724 decomp_param[i].quit = true;
56e93d26
JQ
3725 qemu_cond_signal(&decomp_param[i].cond);
3726 qemu_mutex_unlock(&decomp_param[i].mutex);
3727 }
3728 for (i = 0; i < thread_count; i++) {
797ca154
XG
3729 if (!decomp_param[i].compbuf) {
3730 break;
3731 }
3732
56e93d26
JQ
3733 qemu_thread_join(decompress_threads + i);
3734 qemu_mutex_destroy(&decomp_param[i].mutex);
3735 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3736 inflateEnd(&decomp_param[i].stream);
56e93d26 3737 g_free(decomp_param[i].compbuf);
797ca154 3738 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3739 }
3740 g_free(decompress_threads);
3741 g_free(decomp_param);
56e93d26
JQ
3742 decompress_threads = NULL;
3743 decomp_param = NULL;
34ab9e97 3744 decomp_file = NULL;
56e93d26
JQ
3745}
3746
34ab9e97 3747static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3748{
3749 int i, thread_count;
3750
3751 if (!migrate_use_compression()) {
3752 return 0;
3753 }
3754
3755 thread_count = migrate_decompress_threads();
3756 decompress_threads = g_new0(QemuThread, thread_count);
3757 decomp_param = g_new0(DecompressParam, thread_count);
3758 qemu_mutex_init(&decomp_done_lock);
3759 qemu_cond_init(&decomp_done_cond);
34ab9e97 3760 decomp_file = f;
797ca154
XG
3761 for (i = 0; i < thread_count; i++) {
3762 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3763 goto exit;
3764 }
3765
3766 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3767 qemu_mutex_init(&decomp_param[i].mutex);
3768 qemu_cond_init(&decomp_param[i].cond);
3769 decomp_param[i].done = true;
3770 decomp_param[i].quit = false;
3771 qemu_thread_create(decompress_threads + i, "decompress",
3772 do_data_decompress, decomp_param + i,
3773 QEMU_THREAD_JOINABLE);
3774 }
3775 return 0;
3776exit:
3777 compress_threads_load_cleanup();
3778 return -1;
3779}
3780
c1bc6626 3781static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3782 void *host, int len)
3783{
3784 int idx, thread_count;
3785
3786 thread_count = migrate_decompress_threads();
37396950 3787 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3788 while (true) {
3789 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3790 if (decomp_param[idx].done) {
33d151f4
LL
3791 decomp_param[idx].done = false;
3792 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3793 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3794 decomp_param[idx].des = host;
3795 decomp_param[idx].len = len;
33d151f4
LL
3796 qemu_cond_signal(&decomp_param[idx].cond);
3797 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3798 break;
3799 }
3800 }
3801 if (idx < thread_count) {
3802 break;
73a8912b
LL
3803 } else {
3804 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3805 }
3806 }
3807}
3808
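
decompress_data_with_multi_threads() above implements a simple dispatch loop: scan for a worker whose "done" flag is set, hand it the buffer under its mutex and signal its condition variable, or wait on the shared "done" condition if every worker is busy; the worker signals "done" back when it finishes. A much-reduced standalone sketch of that handshake with a single worker is below; all names are illustrative and this is pthreads, not QEMU's threading helpers (build with -lpthread).

/* Minimal single-worker version of the done/quit handshake used by the
 * decompress threads above. Illustrative only. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t param_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  param_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t done_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cond  = PTHREAD_COND_INITIALIZER;

static int  pending_job = 0;     /* stands in for param->des / param->len */
static bool done = true;         /* worker is idle and ready for work */
static bool quit = false;

static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&param_lock);
    while (!quit) {
        if (pending_job) {
            int job = pending_job;
            pending_job = 0;
            pthread_mutex_unlock(&param_lock);

            printf("worker: processing job %d\n", job);   /* "decompress" */

            pthread_mutex_lock(&done_lock);
            done = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);

            pthread_mutex_lock(&param_lock);
        } else {
            pthread_cond_wait(&param_cond, &param_lock);
        }
    }
    pthread_mutex_unlock(&param_lock);
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, worker, NULL);

    for (int job = 1; job <= 3; job++) {
        /* Wait until the worker is idle, then hand over the next job */
        pthread_mutex_lock(&done_lock);
        while (!done) {
            pthread_cond_wait(&done_cond, &done_lock);
        }
        done = false;
        pthread_mutex_unlock(&done_lock);

        pthread_mutex_lock(&param_lock);
        pending_job = job;
        pthread_cond_signal(&param_cond);
        pthread_mutex_unlock(&param_lock);
    }

    /* Drain the last job, then stop the worker */
    pthread_mutex_lock(&done_lock);
    while (!done) {
        pthread_cond_wait(&done_cond, &done_lock);
    }
    pthread_mutex_unlock(&done_lock);

    pthread_mutex_lock(&param_lock);
    quit = true;
    pthread_cond_signal(&param_cond);
    pthread_mutex_unlock(&param_lock);

    pthread_join(tid, NULL);
    return 0;
}
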
b70cb3b4
RL
3809static void colo_init_ram_state(void)
3810{
3811 ram_state_init(&ram_state);
b70cb3b4
RL
3812}
3813
13af18f2
ZC
3814/*
3815 * colo cache: this is for the secondary VM, we cache the whole
3816 * memory of the secondary VM. The global lock needs to be held
3817 * to call this helper.
3818 */
3819int colo_init_ram_cache(void)
3820{
3821 RAMBlock *block;
3822
44901b5a
PB
3823 WITH_RCU_READ_LOCK_GUARD() {
3824 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3825 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3826 NULL, false, false);
44901b5a
PB
3827 if (!block->colo_cache) {
3828 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3829 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3830 block->used_length);
3831 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3832 if (block->colo_cache) {
3833 qemu_anon_ram_free(block->colo_cache, block->used_length);
3834 block->colo_cache = NULL;
3835 }
89ac5a1d 3836 }
44901b5a 3837 return -errno;
89ac5a1d 3838 }
e5fdf920
LS
3839 if (!machine_dump_guest_core(current_machine)) {
3840 qemu_madvise(block->colo_cache, block->used_length,
3841 QEMU_MADV_DONTDUMP);
3842 }
13af18f2 3843 }
13af18f2 3844 }
44901b5a 3845
7d9acafa
ZC
3846 /*
3847 * Record the dirty pages that were sent by the PVM; we use this dirty
3848 * bitmap to decide which pages in the cache should be flushed into the
3849 * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3850 */
3851 if (ram_bytes_total()) {
3852 RAMBlock *block;
3853
fbd162e6 3854 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3855 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3856 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3857 }
3858 }
7d9acafa 3859
b70cb3b4 3860 colo_init_ram_state();
13af18f2 3861 return 0;
13af18f2
ZC
3862}
3863
0393031a
HZ
3864/* TODO: duplicated with ram_init_bitmaps */
3865void colo_incoming_start_dirty_log(void)
3866{
3867 RAMBlock *block = NULL;
3868 /* For memory_global_dirty_log_start below. */
3869 qemu_mutex_lock_iothread();
3870 qemu_mutex_lock_ramlist();
3871
3872 memory_global_dirty_log_sync();
3873 WITH_RCU_READ_LOCK_GUARD() {
3874 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3875 ramblock_sync_dirty_bitmap(ram_state, block);
3876 /* Discard this dirty bitmap record */
3877 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3878 }
63b41db4 3879 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3880 }
3881 ram_state->migration_dirty_pages = 0;
3882 qemu_mutex_unlock_ramlist();
3883 qemu_mutex_unlock_iothread();
3884}
3885
13af18f2
ZC
3886 /* The global lock needs to be held to call this helper */
3887void colo_release_ram_cache(void)
3888{
3889 RAMBlock *block;
3890
63b41db4 3891 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3892 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3893 g_free(block->bmap);
3894 block->bmap = NULL;
3895 }
3896
89ac5a1d
DDAG
3897 WITH_RCU_READ_LOCK_GUARD() {
3898 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899 if (block->colo_cache) {
3900 qemu_anon_ram_free(block->colo_cache, block->used_length);
3901 block->colo_cache = NULL;
3902 }
13af18f2
ZC
3903 }
3904 }
0393031a 3905 ram_state_cleanup(&ram_state);
13af18f2
ZC
3906}
3907
f265e0e4
JQ
3908/**
3909 * ram_load_setup: Setup RAM for migration incoming side
3910 *
3911 * Returns zero to indicate success and negative for error
3912 *
3913 * @f: QEMUFile where to receive the data
3914 * @opaque: RAMState pointer
3915 */
3916static int ram_load_setup(QEMUFile *f, void *opaque)
3917{
34ab9e97 3918 if (compress_threads_load_setup(f)) {
797ca154
XG
3919 return -1;
3920 }
3921
f265e0e4 3922 xbzrle_load_setup();
f9494614 3923 ramblock_recv_map_init();
13af18f2 3924
f265e0e4
JQ
3925 return 0;
3926}
3927
3928static int ram_load_cleanup(void *opaque)
3929{
f9494614 3930 RAMBlock *rb;
56eb90af 3931
fbd162e6 3932 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3933 qemu_ram_block_writeback(rb);
56eb90af
JH
3934 }
3935
f265e0e4 3936 xbzrle_load_cleanup();
f0afa331 3937 compress_threads_load_cleanup();
f9494614 3938
fbd162e6 3939 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3940 g_free(rb->receivedmap);
3941 rb->receivedmap = NULL;
3942 }
13af18f2 3943
f265e0e4
JQ
3944 return 0;
3945}
3946
3d0684b2
JQ
3947/**
3948 * ram_postcopy_incoming_init: allocate postcopy data structures
3949 *
3950 * Returns 0 for success and negative if there was one error
3951 *
3952 * @mis: current migration incoming state
3953 *
3954 * Allocate data structures etc needed by incoming migration with
3955 * postcopy-ram. postcopy-ram's similarly named
3956 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3957 */
3958int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3959{
c136180c 3960 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3961}
3962
3d0684b2
JQ
3963/**
3964 * ram_load_postcopy: load a page in postcopy case
3965 *
3966 * Returns 0 for success or -errno in case of error
3967 *
a7180877
DDAG
3968 * Called in postcopy mode by ram_load().
3969 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3970 *
3971 * @f: QEMUFile where to send the data
36f62f11 3972 * @channel: the channel to use for loading
a7180877 3973 */
36f62f11 3974int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
3975{
3976 int flags = 0, ret = 0;
3977 bool place_needed = false;
1aa83678 3978 bool matches_target_page_size = false;
a7180877 3979 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 3980 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
3981
3982 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3983 ram_addr_t addr;
a7180877
DDAG
3984 void *page_buffer = NULL;
3985 void *place_source = NULL;
df9ff5e1 3986 RAMBlock *block = NULL;
a7180877 3987 uint8_t ch;
644acf99 3988 int len;
a7180877
DDAG
3989
3990 addr = qemu_get_be64(f);
7a9ddfbf
PX
3991
3992 /*
3993 * If qemu file error, we should stop here, and then "addr"
3994 * may be invalid
3995 */
3996 ret = qemu_file_get_error(f);
3997 if (ret) {
3998 break;
3999 }
4000
a7180877
DDAG
4001 flags = addr & ~TARGET_PAGE_MASK;
4002 addr &= TARGET_PAGE_MASK;
4003
36f62f11 4004 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4005 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4006 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4007 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4008 if (!block) {
4009 ret = -EINVAL;
4010 break;
4011 }
4c4bad48 4012
898ba906
DH
4013 /*
4014 * Relying on used_length is racy and can result in false positives.
4015 * We might place pages beyond used_length in case RAM was shrunk
4016 * while in postcopy, which is fine - trying to place via
4017 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4018 */
4019 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4020 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4021 ret = -EINVAL;
4022 break;
4023 }
77dadc3f 4024 tmp_page->target_pages++;
1aa83678 4025 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4026 /*
28abd200
DDAG
4027 * Postcopy requires that we place whole host pages atomically;
4028 * these may be huge pages for RAMBlocks that are backed by
4029 * hugetlbfs.
a7180877
DDAG
4030 * To make it atomic, the data is read into a temporary page
4031 * that's moved into place later.
4032 * The migration protocol uses, possibly smaller, target-pages
4033 * however the source ensures it always sends all the components
91ba442f 4034 * of a host page in one chunk.
a7180877 4035 */
77dadc3f 4036 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4037 host_page_offset_from_ram_block_offset(block, addr);
4038 /* If all TPs are zero then we can optimise the placement */
77dadc3f
PX
4039 if (tmp_page->target_pages == 1) {
4040 tmp_page->host_addr =
4041 host_page_from_ram_block_offset(block, addr);
4042 } else if (tmp_page->host_addr !=
4043 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4044 /* not the 1st TP within the HP */
36f62f11 4045 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4046 "Target host page %p, received host page %p "
4047 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4048 channel, tmp_page->host_addr,
cfc7dc8a
PX
4049 host_page_from_ram_block_offset(block, addr),
4050 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4051 ret = -EINVAL;
4052 break;
a7180877
DDAG
4053 }
4054
4055 /*
4056 * If it's the last part of a host page then we place the host
4057 * page
4058 */
77dadc3f
PX
4059 if (tmp_page->target_pages ==
4060 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4061 place_needed = true;
4cbb3c63 4062 }
77dadc3f 4063 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4064 }
4065
4066 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4067 case RAM_SAVE_FLAG_ZERO:
a7180877 4068 ch = qemu_get_byte(f);
2e36bc1b
WY
4069 /*
4070 * We can skip setting page_buffer when
4071 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4072 */
4073 if (ch || !matches_target_page_size) {
4074 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4075 }
a7180877 4076 if (ch) {
77dadc3f 4077 tmp_page->all_zero = false;
a7180877
DDAG
4078 }
4079 break;
4080
4081 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4082 tmp_page->all_zero = false;
1aa83678
PX
4083 if (!matches_target_page_size) {
4084 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4085 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4086 } else {
1aa83678
PX
4087 /*
4088 * For small pages that matches target page size, we
4089 * avoid the qemu_file copy. Instead we directly use
4090 * the buffer of QEMUFile to place the page. Note: we
4091 * cannot do any QEMUFile operation before using that
4092 * buffer to make sure the buffer is valid when
4093 * placing the page.
a7180877
DDAG
4094 */
4095 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4096 TARGET_PAGE_SIZE);
4097 }
4098 break;
644acf99 4099 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4100 tmp_page->all_zero = false;
644acf99
WY
4101 len = qemu_get_be32(f);
4102 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4103 error_report("Invalid compressed data length: %d", len);
4104 ret = -EINVAL;
4105 break;
4106 }
4107 decompress_data_with_multi_threads(f, page_buffer, len);
4108 break;
4109
a7180877
DDAG
4110 case RAM_SAVE_FLAG_EOS:
4111 /* normal exit */
6df264ac 4112 multifd_recv_sync_main();
a7180877
DDAG
4113 break;
4114 default:
29fccade 4115 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4116 " (postcopy mode)", flags);
4117 ret = -EINVAL;
7a9ddfbf
PX
4118 break;
4119 }
4120
644acf99
WY
4121 /* Got the whole host page, wait for decompress before placing. */
4122 if (place_needed) {
4123 ret |= wait_for_decompress_done();
4124 }
4125
7a9ddfbf
PX
4126 /* Detect for any possible file errors */
4127 if (!ret && qemu_file_get_error(f)) {
4128 ret = qemu_file_get_error(f);
a7180877
DDAG
4129 }
4130
7a9ddfbf 4131 if (!ret && place_needed) {
77dadc3f
PX
4132 if (tmp_page->all_zero) {
4133 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4134 } else {
77dadc3f
PX
4135 ret = postcopy_place_page(mis, tmp_page->host_addr,
4136 place_source, block);
a7180877 4137 }
ddf35bdf 4138 place_needed = false;
77dadc3f 4139 postcopy_temp_page_reset(tmp_page);
a7180877 4140 }
a7180877
DDAG
4141 }
4142
4143 return ret;
4144}
4145
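
The postcopy path above never writes target pages straight into guest memory when the RAMBlock uses larger host pages: it accumulates them in a temporary huge page and only "places" the whole host page once target_pages equals block->page_size / TARGET_PAGE_SIZE. The standalone sketch below shows that accounting; the sizes are made up and the memcpy merely stands in for the atomic place step (the real code uses UFFDIO_COPY so the guest never sees a partially filled page).

/* Sketch of assembling one host (huge) page out of smaller target pages
 * before placing it in one go, as ram_load_postcopy() does. Not QEMU code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TARGET_PAGE  4096
#define HOST_PAGE    (2 * 1024 * 1024)      /* e.g. hugetlbfs backing */

int main(void)
{
    uint8_t *tmp_huge_page = malloc(HOST_PAGE);   /* staging buffer */
    uint8_t *guest_ram     = malloc(HOST_PAGE);   /* stands in for the VM */
    unsigned target_pages  = 0;
    unsigned per_host_page = HOST_PAGE / TARGET_PAGE;

    if (!tmp_huge_page || !guest_ram) {
        return 1;
    }

    for (unsigned i = 0; i < per_host_page; i++) {
        /* "Receive" one target page into its slot in the staging buffer */
        memset(tmp_huge_page + (size_t)i * TARGET_PAGE, (int)(i & 0xff),
               TARGET_PAGE);
        target_pages++;

        /* Only when the last target page of the host page arrives do we
         * copy the whole thing into place in one shot. */
        if (target_pages == per_host_page) {
            memcpy(guest_ram, tmp_huge_page, HOST_PAGE);
            printf("placed one %u-byte host page (%u target pages)\n",
                   (unsigned)HOST_PAGE, target_pages);
            target_pages = 0;
        }
    }

    free(tmp_huge_page);
    free(guest_ram);
    return 0;
}
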
acab30b8
DHB
4146static bool postcopy_is_advised(void)
4147{
4148 PostcopyState ps = postcopy_state_get();
4149 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4150}
4151
4152static bool postcopy_is_running(void)
4153{
4154 PostcopyState ps = postcopy_state_get();
4155 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4156}
4157
e6f4aa18
ZC
4158/*
4159 * Flush content of RAM cache into SVM's memory.
4160 * Only flush the pages that were dirtied by the PVM or SVM or both.
4161 */
24fa16f8 4162void colo_flush_ram_cache(void)
e6f4aa18
ZC
4163{
4164 RAMBlock *block = NULL;
4165 void *dst_host;
4166 void *src_host;
4167 unsigned long offset = 0;
4168
d1955d22 4169 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4170 WITH_RCU_READ_LOCK_GUARD() {
4171 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4172 ramblock_sync_dirty_bitmap(ram_state, block);
4173 }
d1955d22 4174 }
d1955d22 4175
e6f4aa18 4176 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4177 WITH_RCU_READ_LOCK_GUARD() {
4178 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4179
89ac5a1d 4180 while (block) {
a6a83cef 4181 unsigned long num = 0;
e6f4aa18 4182
a6a83cef 4183 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4184 if (!offset_in_ramblock(block,
4185 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4186 offset = 0;
a6a83cef 4187 num = 0;
89ac5a1d
DDAG
4188 block = QLIST_NEXT_RCU(block, next);
4189 } else {
a6a83cef
RL
4190 unsigned long i = 0;
4191
4192 for (i = 0; i < num; i++) {
4193 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4194 }
8bba004c
AR
4195 dst_host = block->host
4196 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4197 src_host = block->colo_cache
4198 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4199 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4200 offset += num;
89ac5a1d 4201 }
e6f4aa18
ZC
4202 }
4203 }
e6f4aa18
ZC
4204 trace_colo_flush_ram_cache_end();
4205}
4206
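
colo_flush_ram_cache() above walks the dirty bitmap in runs: find a stretch of consecutively dirty pages, clear their bits, and copy num * TARGET_PAGE_SIZE bytes from the COLO cache back into the SVM's RAM with a single memcpy. A simplified standalone sketch of that run-based copy is below, using a byte-per-page "bitmap" and made-up sizes; it is not the RAMBlock-based code above.

/* Sketch of flushing dirty runs from a cache buffer into RAM, as
 * colo_flush_ram_cache() does per RAMBlock. Illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE     4096
#define NPAGES   16

static uint8_t cache[NPAGES * PAGE];    /* colo_cache stand-in */
static uint8_t ram[NPAGES * PAGE];      /* SVM memory stand-in */
static uint8_t dirty[NPAGES];           /* one flag per page */

int main(void)
{
    /* Pretend pages 3..5 and 9 were dirtied */
    dirty[3] = dirty[4] = dirty[5] = dirty[9] = 1;
    memset(cache, 0xAB, sizeof(cache));

    unsigned long offset = 0;
    while (offset < NPAGES) {
        /* Find the next dirty page ... */
        while (offset < NPAGES && !dirty[offset]) {
            offset++;
        }
        /* ... and how many consecutive pages are dirty from there */
        unsigned long num = 0;
        while (offset + num < NPAGES && dirty[offset + num]) {
            dirty[offset + num] = 0;            /* clear as we go */
            num++;
        }
        if (num) {
            memcpy(ram   + offset * PAGE,
                   cache + offset * PAGE,
                   (size_t)num * PAGE);
            printf("flushed %lu page(s) at page offset %lu\n", num, offset);
            offset += num;
        }
    }
    return 0;
}
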
10da4a36
WY
4207/**
4208 * ram_load_precopy: load pages in precopy case
4209 *
4210 * Returns 0 for success or -errno in case of error
4211 *
4212 * Called in precopy mode by ram_load().
4213 * rcu_read_lock is taken prior to this being called.
4214 *
4215 * @f: QEMUFile where to send the data
4216 */
4217static int ram_load_precopy(QEMUFile *f)
56e93d26 4218{
755e8d7c 4219 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4220 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4221 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4222 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
4223 if (!migrate_use_compression()) {
4224 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4225 }
a7180877 4226
10da4a36 4227 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4228 ram_addr_t addr, total_ram_bytes;
0393031a 4229 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4230 uint8_t ch;
4231
e65cec5e
YK
4232 /*
4233 * Yield periodically to let the main loop run, but an iteration of
4234 * the main loop is expensive, so do it only every so many iterations
4235 */
4236 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4237 aio_co_schedule(qemu_get_current_aio_context(),
4238 qemu_coroutine_self());
4239 qemu_coroutine_yield();
4240 }
4241 i++;
4242
56e93d26
JQ
4243 addr = qemu_get_be64(f);
4244 flags = addr & ~TARGET_PAGE_MASK;
4245 addr &= TARGET_PAGE_MASK;
4246
edc60127
JQ
4247 if (flags & invalid_flags) {
4248 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4249 error_report("Received an unexpected compressed page");
4250 }
4251
4252 ret = -EINVAL;
4253 break;
4254 }
4255
bb890ed5 4256 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4257 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4258 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4259 RAM_CHANNEL_PRECOPY);
4c4bad48 4260
0393031a 4261 host = host_from_ram_block_offset(block, addr);
13af18f2 4262 /*
0393031a
HZ
4263 * After entering the COLO stage, we should not load pages into
4264 * the SVM's memory directly; we put them into colo_cache first.
4265 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4266 * Previously, we copied all of this memory in the COLO preparation
4267 * stage, which required stopping the VM and was time-consuming.
4268 * Here we optimize it with a trick: back up every page during the
4269 * migration process while COLO is enabled. Although this affects
4270 * the speed of the migration, it clearly reduces the downtime of
4271 * backing up all the SVM's memory in the COLO preparation stage.
13af18f2 4272 */
0393031a
HZ
4273 if (migration_incoming_colo_enabled()) {
4274 if (migration_incoming_in_colo_state()) {
4275 /* In COLO stage, put all pages into cache temporarily */
8af66371 4276 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4277 } else {
4278 /*
4279 * In migration stage but before COLO stage,
4280 * Put all pages into both cache and SVM's memory.
4281 */
8af66371 4282 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4283 }
13af18f2 4284 }
a776aa15
DDAG
4285 if (!host) {
4286 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4287 ret = -EINVAL;
4288 break;
4289 }
13af18f2
ZC
4290 if (!migration_incoming_in_colo_state()) {
4291 ramblock_recv_bitmap_set(block, host);
4292 }
4293
1db9d8e5 4294 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4295 }
4296
56e93d26
JQ
4297 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4298 case RAM_SAVE_FLAG_MEM_SIZE:
4299 /* Synchronize RAM block list */
4300 total_ram_bytes = addr;
4301 while (!ret && total_ram_bytes) {
4302 RAMBlock *block;
56e93d26
JQ
4303 char id[256];
4304 ram_addr_t length;
4305
4306 len = qemu_get_byte(f);
4307 qemu_get_buffer(f, (uint8_t *)id, len);
4308 id[len] = 0;
4309 length = qemu_get_be64(f);
4310
e3dd7493 4311 block = qemu_ram_block_by_name(id);
b895de50
CLG
4312 if (block && !qemu_ram_is_migratable(block)) {
4313 error_report("block %s should not be migrated !", id);
4314 ret = -EINVAL;
4315 } else if (block) {
e3dd7493
DDAG
4316 if (length != block->used_length) {
4317 Error *local_err = NULL;
56e93d26 4318
fa53a0e5 4319 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4320 &local_err);
4321 if (local_err) {
4322 error_report_err(local_err);
56e93d26 4323 }
56e93d26 4324 }
ef08fb38 4325 /* For postcopy we need to check hugepage sizes match */
e846b746 4326 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4327 block->page_size != qemu_host_page_size) {
4328 uint64_t remote_page_size = qemu_get_be64(f);
4329 if (remote_page_size != block->page_size) {
4330 error_report("Mismatched RAM page size %s "
4331 "(local) %zd != %" PRId64,
4332 id, block->page_size,
4333 remote_page_size);
4334 ret = -EINVAL;
4335 }
4336 }
fbd162e6
YK
4337 if (migrate_ignore_shared()) {
4338 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4339 if (ramblock_is_ignored(block) &&
4340 block->mr->addr != addr) {
4341 error_report("Mismatched GPAs for block %s "
4342 "%" PRId64 "!= %" PRId64,
4343 id, (uint64_t)addr,
4344 (uint64_t)block->mr->addr);
4345 ret = -EINVAL;
4346 }
4347 }
e3dd7493
DDAG
4348 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4349 block->idstr);
4350 } else {
56e93d26
JQ
4351 error_report("Unknown ramblock \"%s\", cannot "
4352 "accept migration", id);
4353 ret = -EINVAL;
4354 }
4355
4356 total_ram_bytes -= length;
4357 }
4358 break;
a776aa15 4359
bb890ed5 4360 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4361 ch = qemu_get_byte(f);
4362 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4363 break;
a776aa15 4364
56e93d26 4365 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4366 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4367 break;
56e93d26 4368
a776aa15 4369 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4370 len = qemu_get_be32(f);
4371 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4372 error_report("Invalid compressed data length: %d", len);
4373 ret = -EINVAL;
4374 break;
4375 }
c1bc6626 4376 decompress_data_with_multi_threads(f, host, len);
56e93d26 4377 break;
a776aa15 4378
56e93d26 4379 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4380 if (load_xbzrle(f, addr, host) < 0) {
4381 error_report("Failed to decompress XBZRLE page at "
4382 RAM_ADDR_FMT, addr);
4383 ret = -EINVAL;
4384 break;
4385 }
4386 break;
4387 case RAM_SAVE_FLAG_EOS:
4388 /* normal exit */
6df264ac 4389 multifd_recv_sync_main();
56e93d26
JQ
4390 break;
4391 default:
4392 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4393 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4394 } else {
29fccade 4395 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4396 flags);
4397 ret = -EINVAL;
4398 }
4399 }
4400 if (!ret) {
4401 ret = qemu_file_get_error(f);
4402 }
0393031a
HZ
4403 if (!ret && host_bak) {
4404 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4405 }
56e93d26
JQ
4406 }
4407
ca1a6b70 4408 ret |= wait_for_decompress_done();
10da4a36
WY
4409 return ret;
4410}
4411
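
Each page record in the stream starts with a single be64 value in which the page-aligned guest address and the RAM_SAVE_FLAG_* bits share space: since addresses are multiples of the target page size, the flags live in the low bits and are split off with the masks used above (flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK). A tiny standalone illustration of that packing follows; the flag values here are examples, not QEMU's definitions.

/* Sketch of packing flags into the low bits of a page-aligned address,
 * as the RAM page records do. Example values only. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define PAGE_SIZE   4096ULL
#define PAGE_MASK   (~(PAGE_SIZE - 1))
#define FLAG_PAGE   0x08ULL                    /* example flag value */

int main(void)
{
    uint64_t addr  = 0x1234ULL * PAGE_SIZE;    /* page-aligned address */
    uint64_t word  = addr | FLAG_PAGE;         /* what goes on the wire */

    uint64_t flags = word & ~PAGE_MASK;        /* low bits */
    uint64_t page  = word & PAGE_MASK;         /* aligned address back */

    printf("wire word 0x%" PRIx64 " -> addr 0x%" PRIx64 ", flags 0x%" PRIx64 "\n",
           word, page, flags);
    return 0;
}
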
4412static int ram_load(QEMUFile *f, void *opaque, int version_id)
4413{
4414 int ret = 0;
4415 static uint64_t seq_iter;
4416 /*
4417 * If system is running in postcopy mode, page inserts to host memory must
4418 * be atomic
4419 */
4420 bool postcopy_running = postcopy_is_running();
4421
4422 seq_iter++;
4423
4424 if (version_id != 4) {
4425 return -EINVAL;
4426 }
4427
4428 /*
4429 * This RCU critical section can be very long running.
4430 * When RCU reclaims in the code start to become numerous,
4431 * it will be necessary to reduce the granularity of this
4432 * critical section.
4433 */
89ac5a1d
DDAG
4434 WITH_RCU_READ_LOCK_GUARD() {
4435 if (postcopy_running) {
36f62f11
PX
4436 /*
4437 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4438 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4439 * service fast page faults.
4440 */
4441 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4442 } else {
4443 ret = ram_load_precopy(f);
4444 }
10da4a36 4445 }
55c4446b 4446 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4447
56e93d26
JQ
4448 return ret;
4449}
4450
c6467627
VSO
4451static bool ram_has_postcopy(void *opaque)
4452{
469dd51b 4453 RAMBlock *rb;
fbd162e6 4454 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4455 if (ramblock_is_pmem(rb)) {
4456 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4457 "is not supported now!", rb->idstr, rb->host);
4458 return false;
4459 }
4460 }
4461
c6467627
VSO
4462 return migrate_postcopy_ram();
4463}
4464
edd090c7
PX
4465/* Sync all the dirty bitmap with destination VM. */
4466static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4467{
4468 RAMBlock *block;
4469 QEMUFile *file = s->to_dst_file;
4470 int ramblock_count = 0;
4471
4472 trace_ram_dirty_bitmap_sync_start();
4473
fbd162e6 4474 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4475 qemu_savevm_send_recv_bitmap(file, block->idstr);
4476 trace_ram_dirty_bitmap_request(block->idstr);
4477 ramblock_count++;
4478 }
4479
4480 trace_ram_dirty_bitmap_sync_wait();
4481
4482 /* Wait until all the ramblocks' dirty bitmap synced */
4483 while (ramblock_count--) {
4484 qemu_sem_wait(&s->rp_state.rp_sem);
4485 }
4486
4487 trace_ram_dirty_bitmap_sync_complete();
4488
4489 return 0;
4490}
4491
4492static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4493{
4494 qemu_sem_post(&s->rp_state.rp_sem);
4495}
4496
a335debb
PX
4497/*
4498 * Read the received bitmap, revert it as the initial dirty bitmap.
4499 * This is only used when the postcopy migration is paused but wants
4500 * to resume from a middle point.
4501 */
4502int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4503{
4504 int ret = -EINVAL;
43044ac0 4505 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4506 QEMUFile *file = s->rp_state.from_dst_file;
4507 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4508 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4509 uint64_t size, end_mark;
4510
4511 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4512
4513 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4514 error_report("%s: incorrect state %s", __func__,
4515 MigrationStatus_str(s->state));
4516 return -EINVAL;
4517 }
4518
4519 /*
4520 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4521 * need the endianness conversion and the padding.
a335debb
PX
4522 */
4523 local_size = ROUND_UP(local_size, 8);
4524
4525 /* Add paddings */
4526 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4527
4528 size = qemu_get_be64(file);
4529
4530 /* The size of the bitmap should match with our ramblock */
4531 if (size != local_size) {
4532 error_report("%s: ramblock '%s' bitmap size mismatch "
4533 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4534 block->idstr, size, local_size);
4535 ret = -EINVAL;
4536 goto out;
4537 }
4538
4539 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4540 end_mark = qemu_get_be64(file);
4541
4542 ret = qemu_file_get_error(file);
4543 if (ret || size != local_size) {
4544 error_report("%s: read bitmap failed for ramblock '%s': %d"
4545 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4546 __func__, block->idstr, ret, local_size, size);
4547 ret = -EIO;
4548 goto out;
4549 }
4550
4551 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4552 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4553 __func__, block->idstr, end_mark);
4554 ret = -EINVAL;
4555 goto out;
4556 }
4557
4558 /*
3a4452d8 4559 * Endianness conversion. We are in postcopy (though paused).
a335debb
PX
4560 * The dirty bitmap won't change. We can directly modify it.
4561 */
4562 bitmap_from_le(block->bmap, le_bitmap, nbits);
4563
4564 /*
4565 * What we received is "received bitmap". Revert it as the initial
4566 * dirty bitmap for this ramblock.
4567 */
4568 bitmap_complement(block->bmap, block->bmap, nbits);
4569
be39b4cd
DH
4570 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4571 ramblock_dirty_bitmap_clear_discarded_pages(block);
4572
4573 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4574 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4575
edd090c7
PX
4576 /*
4577 * We succeeded to sync bitmap for current ramblock. If this is
4578 * the last one to sync, we need to notify the main send thread.
4579 */
4580 ram_dirty_bitmap_reload_notify(s);
4581
a335debb
PX
4582 ret = 0;
4583out:
bf269906 4584 g_free(le_bitmap);
a335debb
PX
4585 return ret;
4586}
4587
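
ram_dirty_bitmap_reload() above expects the peer to send each block's received-bitmap as a be64 byte count, the bitmap itself in little-endian long order padded up to whole 8-byte longs, and a be64 end marker; the bitmap is then complemented, because pages the destination has already received are exactly the ones that no longer need to be treated as dirty. The standalone sketch below reproduces the size calculation and the inversion; the macros are local to the sketch and 64-bit longs are assumed.

/* Sketch of the received-bitmap size math and the complement step used
 * when reloading the dirty bitmap for postcopy recovery. Not QEMU code. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define ROUND_UP(n, d)      (DIV_ROUND_UP(n, d) * (d))

int main(void)
{
    uint64_t block_len = 1ULL << 30;            /* 1 GiB RAMBlock */
    uint64_t page_bits = 12;                    /* 4 KiB target pages */
    uint64_t nbits     = block_len >> page_bits;

    /* Bytes on the wire: one bit per page, padded to whole 64-bit longs */
    uint64_t local_size = ROUND_UP(DIV_ROUND_UP(nbits, 8), 8);
    printf("pages %" PRIu64 ", bitmap bytes on the wire %" PRIu64 "\n",
           nbits, local_size);

    /* "Received" -> "still dirty": complement every word of the bitmap */
    uint64_t received[4] = { ~0ULL, 0x00000000ffffffffULL, 0, ~0ULL };
    for (unsigned i = 0; i < 4; i++) {
        uint64_t dirty = ~received[i];
        printf("word %u: received 0x%016" PRIx64 " -> dirty 0x%016" PRIx64 "\n",
               i, received[i], dirty);
    }
    return 0;
}
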
edd090c7
PX
4588static int ram_resume_prepare(MigrationState *s, void *opaque)
4589{
4590 RAMState *rs = *(RAMState **)opaque;
08614f34 4591 int ret;
edd090c7 4592
08614f34
PX
4593 ret = ram_dirty_bitmap_sync_all(s, rs);
4594 if (ret) {
4595 return ret;
4596 }
4597
4598 ram_state_resume_prepare(rs, s->to_dst_file);
4599
4600 return 0;
edd090c7
PX
4601}
4602
36f62f11
PX
4603void postcopy_preempt_shutdown_file(MigrationState *s)
4604{
4605 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4606 qemu_fflush(s->postcopy_qemufile_src);
4607}
4608
56e93d26 4609static SaveVMHandlers savevm_ram_handlers = {
9907e842 4610 .save_setup = ram_save_setup,
56e93d26 4611 .save_live_iterate = ram_save_iterate,
763c906b 4612 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4613 .save_live_complete_precopy = ram_save_complete,
c6467627 4614 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4615 .save_live_pending = ram_save_pending,
4616 .load_state = ram_load,
f265e0e4
JQ
4617 .save_cleanup = ram_save_cleanup,
4618 .load_setup = ram_load_setup,
4619 .load_cleanup = ram_load_cleanup,
edd090c7 4620 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4621};
4622
c7c0e724
DH
4623static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4624 size_t old_size, size_t new_size)
4625{
cc61c703 4626 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4627 ram_addr_t offset;
4628 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4629 Error *err = NULL;
4630
4631 if (ramblock_is_ignored(rb)) {
4632 return;
4633 }
4634
4635 if (!migration_is_idle()) {
4636 /*
4637 * Precopy code on the source cannot deal with the size of RAM blocks
4638 * changing at random points in time - especially after sending the
4639 * RAM block sizes in the migration stream, they must no longer change.
4640 * Abort and indicate a proper reason.
4641 */
4642 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4643 migration_cancel(err);
c7c0e724 4644 error_free(err);
c7c0e724 4645 }
cc61c703
DH
4646
4647 switch (ps) {
4648 case POSTCOPY_INCOMING_ADVISE:
4649 /*
4650 * Update what ram_postcopy_incoming_init()->init_range() does at the
4651 * time postcopy was advised. Syncing RAM blocks with the source will
4652 * result in RAM resizes.
4653 */
4654 if (old_size < new_size) {
4655 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4656 error_report("RAM block '%s' discard of resized RAM failed",
4657 rb->idstr);
4658 }
4659 }
898ba906 4660 rb->postcopy_length = new_size;
cc61c703
DH
4661 break;
4662 case POSTCOPY_INCOMING_NONE:
4663 case POSTCOPY_INCOMING_RUNNING:
4664 case POSTCOPY_INCOMING_END:
4665 /*
4666 * Once our guest is running, postcopy no longer cares about
4667 * resizes. When growing, the new memory was not available on the
4668 * source, no handler needed.
4669 */
4670 break;
4671 default:
4672 error_report("RAM block '%s' resized during postcopy state: %d",
4673 rb->idstr, ps);
4674 exit(-1);
4675 }
c7c0e724
DH
4676}
4677
4678static RAMBlockNotifier ram_mig_ram_notifier = {
4679 .ram_block_resized = ram_mig_ram_block_resized,
4680};
4681
56e93d26
JQ
4682void ram_mig_init(void)
4683{
4684 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4685 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4686 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4687}