migration: Make dirty_sync_missed_zero_copy atomic
[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
53d37d36 43#include "page_cache.h"
56e93d26 44#include "qemu/error-report.h"
e688df6b 45#include "qapi/error.h"
ab7cbb0b 46#include "qapi/qapi-types-migration.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
b0c3cf94 55#include "sysemu/cpu-throttle.h"
edd090c7 56#include "savevm.h"
b9ee2f7d 57#include "qemu/iov.h"
d32ca5ad 58#include "multifd.h"
278e2f55
AG
59#include "sysemu/runstate.h"
60
e5fdf920
LS
61#include "hw/boards.h" /* for machine_dump_guest_core() */
62
278e2f55
AG
63#if defined(__linux__)
64#include "qemu/userfaultfd.h"
65#endif /* defined(__linux__) */
56e93d26 66
56e93d26
JQ
67/***********************************************************/
68/* ram save/restore */
69
7b548761
JQ
70/*
71 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
72 * worked for pages that were filled with the same char. We switched
bb890ed5 73 * it to only search for the zero value, and renamed it to avoid
7b548761 74 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
bb890ed5 75 */
7b548761
JQ
76/*
77 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
78 */
79#define RAM_SAVE_FLAG_FULL 0x01
bb890ed5 80#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
81#define RAM_SAVE_FLAG_MEM_SIZE 0x04
82#define RAM_SAVE_FLAG_PAGE 0x08
83#define RAM_SAVE_FLAG_EOS 0x10
84#define RAM_SAVE_FLAG_CONTINUE 0x20
85#define RAM_SAVE_FLAG_XBZRLE 0x40
7b548761 86/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
56e93d26 87#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
7b548761 88/* We can't use any flag that is bigger than 0x200 */
56e93d26 89
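/*
 * For illustration, the flags above travel in the low bits of the 64-bit
 * page offset written by save_page_header(), which works because offsets
 * are always TARGET_PAGE_SIZE aligned.  A normal page from an
 * already-announced block is sent roughly as:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *     qemu_put_buffer(f, page, TARGET_PAGE_SIZE);
 */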
04ffce13 90int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
91 uint8_t *, int) = xbzrle_encode_buffer;
92#if defined(CONFIG_AVX512BW_OPT)
93#include "qemu/cpuid.h"
94static void __attribute__((constructor)) init_cpu_flag(void)
95{
96 unsigned max = __get_cpuid_max(0, NULL);
97 int a, b, c, d;
98 if (max >= 1) {
99 __cpuid(1, a, b, c, d);
100 /* We must check that AVX is not just available, but usable. */
101 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
102 int bv;
103 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
104 __cpuid_count(7, 0, a, b, c, d);
105 /* 0xe6:
106 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
107 * and ZMM16-ZMM31 state are enabled by OS)
108 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
109 */
110 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
111 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
112 }
113 }
114 }
115}
116#endif
117
9360447d
JQ
118XBZRLECacheStats xbzrle_counters;
119
f1668764
PX
120/* used by the search for pages to send */
121struct PageSearchStatus {
122 /* The migration channel used for a specific host page */
123 QEMUFile *pss_channel;
ec6f3ab9
PX
124 /* Last block from where we have sent data */
125 RAMBlock *last_sent_block;
f1668764
PX
126 /* Current block being searched */
127 RAMBlock *block;
128 /* Current page to search from */
129 unsigned long page;
130 /* Set once we wrap around */
131 bool complete_round;
f1668764
PX
132 /* Whether we're sending a host page */
133 bool host_page_sending;
134 /* The start/end of current host page. Invalid if host_page_sending==false */
135 unsigned long host_page_start;
136 unsigned long host_page_end;
137};
138typedef struct PageSearchStatus PageSearchStatus;
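/*
 * Example for the host-page fields above: with 2MiB hugepages and 4KiB
 * target pages (sizes chosen only for illustration), host_page_start and
 * host_page_end are 512 target-page indexes apart, and the dirty-page
 * search is clamped to that window while host_page_sending is true.
 */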
139
56e93d26
JQ
140/* struct contains XBZRLE cache and a static page
141 used by the compression */
142static struct {
143 /* buffer used for XBZRLE encoding */
144 uint8_t *encoded_buf;
145 /* buffer for storing page content */
146 uint8_t *current_buf;
147 /* Cache for XBZRLE, Protected by lock. */
148 PageCache *cache;
149 QemuMutex lock;
c00e0928
JQ
150 /* it will store a page full of zeros */
151 uint8_t *zero_target_page;
f265e0e4
JQ
152 /* buffer used for XBZRLE decoding */
153 uint8_t *decoded_buf;
56e93d26
JQ
154} XBZRLE;
155
56e93d26
JQ
156static void XBZRLE_cache_lock(void)
157{
f4c51a6b 158 if (migrate_use_xbzrle()) {
56e93d26 159 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 160 }
56e93d26
JQ
161}
162
163static void XBZRLE_cache_unlock(void)
164{
f4c51a6b 165 if (migrate_use_xbzrle()) {
56e93d26 166 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 167 }
56e93d26
JQ
168}
169
3d0684b2
JQ
170/**
171 * xbzrle_cache_resize: resize the xbzrle cache
172 *
cbde7be9 173 * This function is called from migrate_params_apply in the main
3d0684b2
JQ
174 * thread, possibly while a migration is in progress. A running
175 * migration may be using the cache and might finish during this call,
176 * hence changes to the cache are protected by XBZRLE.lock().
177 *
c9dede2d 178 * Returns 0 for success or -1 for error
3d0684b2
JQ
179 *
180 * @new_size: new cache size
8acabf69 181 * @errp: set *errp with the reason if the check fails
56e93d26 182 */
8b9407a0 183int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
184{
185 PageCache *new_cache;
c9dede2d 186 int64_t ret = 0;
56e93d26 187
8acabf69
JQ
188 /* Check for truncation */
189 if (new_size != (size_t)new_size) {
190 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
191 "exceeding address space");
192 return -1;
193 }
194
2a313e5c
JQ
195 if (new_size == migrate_xbzrle_cache_size()) {
196 /* nothing to do */
c9dede2d 197 return 0;
2a313e5c
JQ
198 }
199
56e93d26
JQ
200 XBZRLE_cache_lock();
201
202 if (XBZRLE.cache != NULL) {
80f8dfde 203 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 204 if (!new_cache) {
56e93d26
JQ
205 ret = -1;
206 goto out;
207 }
208
209 cache_fini(XBZRLE.cache);
210 XBZRLE.cache = new_cache;
211 }
56e93d26
JQ
212out:
213 XBZRLE_cache_unlock();
214 return ret;
215}
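/*
 * Minimal caller sketch (identifiers other than xbzrle_cache_resize()
 * are illustrative):
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */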
216
20123ee1
PX
217static bool postcopy_preempt_active(void)
218{
219 return migrate_postcopy_preempt() && migration_in_postcopy();
220}
221
3ded54b1 222bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
223{
224 return !qemu_ram_is_migratable(block) ||
225 (migrate_ignore_shared() && qemu_ram_is_shared(block));
226}
227
343f632c
DDAG
228#undef RAMBLOCK_FOREACH
229
fbd162e6
YK
230int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
231{
232 RAMBlock *block;
233 int ret = 0;
234
89ac5a1d
DDAG
235 RCU_READ_LOCK_GUARD();
236
fbd162e6
YK
237 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
238 ret = func(block, opaque);
239 if (ret) {
240 break;
241 }
242 }
fbd162e6
YK
243 return ret;
244}
245
f9494614
AP
246static void ramblock_recv_map_init(void)
247{
248 RAMBlock *rb;
249
fbd162e6 250 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
251 assert(!rb->receivedmap);
252 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
253 }
254}
255
256int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
257{
258 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
259 rb->receivedmap);
260}
261
1cba9f6e
DDAG
262bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
263{
264 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
265}
266
f9494614
AP
267void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
268{
269 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
270}
271
272void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
273 size_t nr)
274{
275 bitmap_set_atomic(rb->receivedmap,
276 ramblock_recv_bitmap_offset(host_addr, rb),
277 nr);
278}
279
a335debb
PX
280#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
281
282/*
283 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
284 *
285 * Returns >0 if success with sent bytes, or <0 if error.
286 */
287int64_t ramblock_recv_bitmap_send(QEMUFile *file,
288 const char *block_name)
289{
290 RAMBlock *block = qemu_ram_block_by_name(block_name);
291 unsigned long *le_bitmap, nbits;
292 uint64_t size;
293
294 if (!block) {
295 error_report("%s: invalid block name: %s", __func__, block_name);
296 return -1;
297 }
298
898ba906 299 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
300
301 /*
302 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
303 * machines we may need 4 more bytes for padding (see below
304 * comment). So extend it a bit beforehand.
305 */
306 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
307
308 /*
309 * Always use little endian when sending the bitmap. This is
310 * required when the source and destination VMs are not using the
3a4452d8 311 * same endianness. (Note: big endian won't work.)
a335debb
PX
312 */
313 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
314
315 /* Size of the bitmap, in bytes */
a725ef9f 316 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
317
318 /*
319 * size is always aligned to 8 bytes for 64bit machines, but it
320 * may not be true for 32bit machines. We need this padding to
321 * make sure the migration can survive even between 32bit and
322 * 64bit machines.
323 */
324 size = ROUND_UP(size, 8);
325
326 qemu_put_be64(file, size);
327 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
328 /*
329 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 330 * some "mysterious" reason.
a335debb
PX
331 */
332 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
333 qemu_fflush(file);
334
bf269906 335 g_free(le_bitmap);
a335debb
PX
336
337 if (qemu_file_get_error(file)) {
338 return qemu_file_get_error(file);
339 }
340
341 return size + sizeof(size);
342}
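/*
 * Worked example of the sizing above: a block of 100 target pages gives
 * nbits = 100, DIV_ROUND_UP(100, 8) = 13 bytes of bitmap, rounded up to
 * size = 16 on the wire.  The function then reports size + sizeof(size)
 * = 24 sent bytes; the 8-byte ending marker is written in addition.
 */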
343
ec481c6c
JQ
344/*
345 * An outstanding page request, on the source, having been received
346 * and queued
347 */
348struct RAMSrcPageRequest {
349 RAMBlock *rb;
350 hwaddr offset;
351 hwaddr len;
352
353 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
354};
355
6f37bb8b
JQ
356/* State of RAM for migration */
357struct RAMState {
f1668764
PX
358 /*
359 * PageSearchStatus structures for the channels when send pages.
360 * Protected by the bitmap_mutex.
361 */
362 PageSearchStatus pss[RAM_CHANNEL_MAX];
278e2f55
AG
363 /* UFFD file descriptor, used in 'write-tracking' migration */
364 int uffdio_fd;
8d80e195
JQ
365 /* total ram size in bytes */
366 uint64_t ram_bytes_total;
6f37bb8b
JQ
367 /* Last block that we have visited searching for dirty pages */
368 RAMBlock *last_seen_block;
269ace29
JQ
369 /* Last dirty target page we have sent */
370 ram_addr_t last_page;
6f37bb8b
JQ
371 /* last ram version we have seen */
372 uint32_t last_version;
8d820d6f
JQ
373 /* How many times we have dirty too many pages */
374 int dirty_rate_high_cnt;
f664da80
JQ
375 /* these variables are used for bitmap sync */
376 /* last time we did a full bitmap_sync */
377 int64_t time_last_bitmap_sync;
eac74159 378 /* bytes transferred at start_time */
c4bdf0cf 379 uint64_t bytes_xfer_prev;
a66cd90c 380 /* number of dirty pages since start_time */
68908ed6 381 uint64_t num_dirty_pages_period;
b5833fde
JQ
382 /* xbzrle misses since the beginning of the period */
383 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
384 /* Amount of xbzrle pages since the beginning of the period */
385 uint64_t xbzrle_pages_prev;
386 /* Amount of xbzrle encoded bytes since the beginning of the period */
387 uint64_t xbzrle_bytes_prev;
1a373522
DH
388 /* Start using XBZRLE (e.g., after the first round). */
389 bool xbzrle_enabled;
05931ec5
JQ
390 /* Are we on the last stage of migration */
391 bool last_stage;
76e03000
XG
392 /* compression statistics since the beginning of the period */
393 /* count of times there was no free thread to compress data */
394 uint64_t compress_thread_busy_prev;
395 /* amount of bytes after compression */
396 uint64_t compressed_size_prev;
397 /* amount of compressed pages */
398 uint64_t compress_pages_prev;
399
be8b02ed
XG
400 /* total handled target pages at the beginning of period */
401 uint64_t target_page_count_prev;
402 /* total handled target pages since start */
403 uint64_t target_page_count;
9360447d 404 /* number of dirty bits in the bitmap */
2dfaf12e 405 uint64_t migration_dirty_pages;
f1668764
PX
406 /*
407 * Protects:
408 * - dirty/clear bitmap
409 * - migration_dirty_pages
410 * - pss structures
411 */
108cfae0 412 QemuMutex bitmap_mutex;
68a098f3
JQ
413 /* The RAMBlock used in the last src_page_requests */
414 RAMBlock *last_req_rb;
ec481c6c
JQ
415 /* Queue of outstanding page requests from the destination */
416 QemuMutex src_page_req_mutex;
b58deb34 417 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
418};
419typedef struct RAMState RAMState;
420
53518d94 421static RAMState *ram_state;
6f37bb8b 422
bd227060
WW
423static NotifierWithReturnList precopy_notifier_list;
424
a1fe28df
PX
425/* Whether postcopy has queued requests */
426static bool postcopy_has_request(RAMState *rs)
427{
428 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
429}
430
bd227060
WW
431void precopy_infrastructure_init(void)
432{
433 notifier_with_return_list_init(&precopy_notifier_list);
434}
435
436void precopy_add_notifier(NotifierWithReturn *n)
437{
438 notifier_with_return_list_add(&precopy_notifier_list, n);
439}
440
441void precopy_remove_notifier(NotifierWithReturn *n)
442{
443 notifier_with_return_remove(n);
444}
445
446int precopy_notify(PrecopyNotifyReason reason, Error **errp)
447{
448 PrecopyNotifyData pnd;
449 pnd.reason = reason;
450 pnd.errp = errp;
451
452 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
453}
454
9edabd4d 455uint64_t ram_bytes_remaining(void)
2f4fde93 456{
bae416e5
DDAG
457 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
458 0;
2f4fde93
JQ
459}
460
abce5fa1 461RAMStats ram_counters;
96506894 462
26a26069 463void ram_transferred_add(uint64_t bytes)
4c2d0f6d 464{
ae680668
DE
465 if (runstate_is_running()) {
466 ram_counters.precopy_bytes += bytes;
467 } else if (migration_in_postcopy()) {
abce5fa1 468 stat64_add(&ram_counters.postcopy_bytes, bytes);
ae680668
DE
469 } else {
470 ram_counters.downtime_bytes += bytes;
471 }
abce5fa1 472 stat64_add(&ram_counters.transferred, bytes);
4c2d0f6d
DE
473}
474
4010ba38
JQ
475struct MigrationOps {
476 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
477};
478typedef struct MigrationOps MigrationOps;
479
480MigrationOps *migration_ops;
481
76e03000
XG
482CompressionStats compression_counters;
483
56e93d26 484struct CompressParam {
56e93d26 485 bool done;
90e56fb4 486 bool quit;
5e5fdcff 487 bool zero_page;
56e93d26
JQ
488 QEMUFile *file;
489 QemuMutex mutex;
490 QemuCond cond;
491 RAMBlock *block;
492 ram_addr_t offset;
34ab9e97
XG
493
494 /* internally used fields */
dcaf446e 495 z_stream stream;
34ab9e97 496 uint8_t *originbuf;
56e93d26
JQ
497};
498typedef struct CompressParam CompressParam;
499
500struct DecompressParam {
73a8912b 501 bool done;
90e56fb4 502 bool quit;
56e93d26
JQ
503 QemuMutex mutex;
504 QemuCond cond;
505 void *des;
d341d9f3 506 uint8_t *compbuf;
56e93d26 507 int len;
797ca154 508 z_stream stream;
56e93d26
JQ
509};
510typedef struct DecompressParam DecompressParam;
511
512static CompressParam *comp_param;
513static QemuThread *compress_threads;
514/* comp_done_cond is used to wake up the migration thread when
515 * one of the compression threads has finished the compression.
516 * comp_done_lock is used together with comp_done_cond.
517 */
0d9f9a5c
LL
518static QemuMutex comp_done_lock;
519static QemuCond comp_done_cond;
56e93d26 520
34ab9e97 521static QEMUFile *decomp_file;
56e93d26
JQ
522static DecompressParam *decomp_param;
523static QemuThread *decompress_threads;
73a8912b
LL
524static QemuMutex decomp_done_lock;
525static QemuCond decomp_done_cond;
56e93d26 526
93589827
PX
527static int ram_save_host_page_urgent(PageSearchStatus *pss);
528
5e5fdcff 529static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 530 ram_addr_t offset, uint8_t *source_buf);
56e93d26 531
ebd88a49
PX
532/* NOTE: page is the PFN not real ram_addr_t. */
533static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
534{
535 pss->block = rb;
536 pss->page = page;
537 pss->complete_round = false;
538}
539
93589827
PX
540/*
541 * Check whether two PSSs are actively sending the same page. Return true
542 * if it is, false otherwise.
543 */
544static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
545{
546 return pss1->host_page_sending && pss2->host_page_sending &&
547 (pss1->host_page_start == pss2->host_page_start);
548}
549
56e93d26
JQ
550static void *do_data_compress(void *opaque)
551{
552 CompressParam *param = opaque;
a7a9a88f
LL
553 RAMBlock *block;
554 ram_addr_t offset;
5e5fdcff 555 bool zero_page;
56e93d26 556
a7a9a88f 557 qemu_mutex_lock(&param->mutex);
90e56fb4 558 while (!param->quit) {
a7a9a88f
LL
559 if (param->block) {
560 block = param->block;
561 offset = param->offset;
562 param->block = NULL;
563 qemu_mutex_unlock(&param->mutex);
564
5e5fdcff
XG
565 zero_page = do_compress_ram_page(param->file, &param->stream,
566 block, offset, param->originbuf);
a7a9a88f 567
0d9f9a5c 568 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 569 param->done = true;
5e5fdcff 570 param->zero_page = zero_page;
0d9f9a5c
LL
571 qemu_cond_signal(&comp_done_cond);
572 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
573
574 qemu_mutex_lock(&param->mutex);
575 } else {
56e93d26
JQ
576 qemu_cond_wait(&param->cond, &param->mutex);
577 }
56e93d26 578 }
a7a9a88f 579 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
580
581 return NULL;
582}
583
f0afa331 584static void compress_threads_save_cleanup(void)
56e93d26
JQ
585{
586 int i, thread_count;
587
05306935 588 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
589 return;
590 }
05306935 591
56e93d26
JQ
592 thread_count = migrate_compress_threads();
593 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
594 /*
595 * we use it as an indicator which shows if the thread is
596 * properly init'd or not
597 */
598 if (!comp_param[i].file) {
599 break;
600 }
05306935
FL
601
602 qemu_mutex_lock(&comp_param[i].mutex);
603 comp_param[i].quit = true;
604 qemu_cond_signal(&comp_param[i].cond);
605 qemu_mutex_unlock(&comp_param[i].mutex);
606
56e93d26 607 qemu_thread_join(compress_threads + i);
56e93d26
JQ
608 qemu_mutex_destroy(&comp_param[i].mutex);
609 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 610 deflateEnd(&comp_param[i].stream);
34ab9e97 611 g_free(comp_param[i].originbuf);
dcaf446e
XG
612 qemu_fclose(comp_param[i].file);
613 comp_param[i].file = NULL;
56e93d26 614 }
0d9f9a5c
LL
615 qemu_mutex_destroy(&comp_done_lock);
616 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
617 g_free(compress_threads);
618 g_free(comp_param);
56e93d26
JQ
619 compress_threads = NULL;
620 comp_param = NULL;
56e93d26
JQ
621}
622
dcaf446e 623static int compress_threads_save_setup(void)
56e93d26
JQ
624{
625 int i, thread_count;
626
627 if (!migrate_use_compression()) {
dcaf446e 628 return 0;
56e93d26 629 }
56e93d26
JQ
630 thread_count = migrate_compress_threads();
631 compress_threads = g_new0(QemuThread, thread_count);
632 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
633 qemu_cond_init(&comp_done_cond);
634 qemu_mutex_init(&comp_done_lock);
56e93d26 635 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
636 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
637 if (!comp_param[i].originbuf) {
638 goto exit;
639 }
640
dcaf446e
XG
641 if (deflateInit(&comp_param[i].stream,
642 migrate_compress_level()) != Z_OK) {
34ab9e97 643 g_free(comp_param[i].originbuf);
dcaf446e
XG
644 goto exit;
645 }
646
e110aa91
C
647 /* comp_param[i].file is just used as a dummy buffer to save data,
648 * set its ops to empty.
56e93d26 649 */
77ef2dc1 650 comp_param[i].file = qemu_file_new_output(
c0e0825c 651 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 652 comp_param[i].done = true;
90e56fb4 653 comp_param[i].quit = false;
56e93d26
JQ
654 qemu_mutex_init(&comp_param[i].mutex);
655 qemu_cond_init(&comp_param[i].cond);
656 qemu_thread_create(compress_threads + i, "compress",
657 do_data_compress, comp_param + i,
658 QEMU_THREAD_JOINABLE);
659 }
dcaf446e
XG
660 return 0;
661
662exit:
663 compress_threads_save_cleanup();
664 return -1;
56e93d26
JQ
665}
666
667/**
3d0684b2 668 * save_page_header: write page header to wire
56e93d26
JQ
669 *
670 * If this is the 1st block, it also writes the block identification
671 *
3d0684b2 672 * Returns the number of bytes written
56e93d26 673 *
ec6f3ab9 674 * @pss: current PSS channel status
56e93d26
JQ
675 * @block: block that contains the page we want to send
676 * @offset: offset inside the block for the page
677 * the lower bits of which contain flags
678 */
37502df3
LS
679static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
680 RAMBlock *block, ram_addr_t offset)
56e93d26 681{
9f5f380b 682 size_t size, len;
ec6f3ab9 683 bool same_block = (block == pss->last_sent_block);
56e93d26 684
10661f11 685 if (same_block) {
24795694
JQ
686 offset |= RAM_SAVE_FLAG_CONTINUE;
687 }
2bf3aa85 688 qemu_put_be64(f, offset);
56e93d26
JQ
689 size = 8;
690
10661f11 691 if (!same_block) {
9f5f380b 692 len = strlen(block->idstr);
2bf3aa85
JQ
693 qemu_put_byte(f, len);
694 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 695 size += 1 + len;
ec6f3ab9 696 pss->last_sent_block = block;
56e93d26
JQ
697 }
698 return size;
699}
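/*
 * Resulting wire layout, for reference: a continued page costs just the
 * 8-byte be64 of (offset | flags); the first page of a newly named block
 * additionally carries a one-byte idstr length plus the idstr bytes,
 * matching the 8 or 8 + 1 + len sizes returned above.
 */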
700
3d0684b2 701/**
179a8080 702 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
703 *
704 * Reduce amount of guest cpu execution to hopefully slow down memory
705 * writes. If guest dirty memory rate is reduced below the rate at
706 * which we can transfer pages to the destination then we should be
707 * able to complete migration. Some workloads dirty memory way too
708 * fast and will not effectively converge, even with auto-converge.
070afca2 709 */
cbbf8182
KZ
710static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
711 uint64_t bytes_dirty_threshold)
070afca2
JH
712{
713 MigrationState *s = migrate_get_current();
2594f56d 714 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
715 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
716 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 717 int pct_max = s->parameters.max_cpu_throttle;
070afca2 718
cbbf8182
KZ
719 uint64_t throttle_now = cpu_throttle_get_percentage();
720 uint64_t cpu_now, cpu_ideal, throttle_inc;
721
070afca2
JH
722 /* We have not started throttling yet. Let's start it. */
723 if (!cpu_throttle_active()) {
724 cpu_throttle_set(pct_initial);
725 } else {
726 /* Throttling already on, just increase the rate */
cbbf8182
KZ
727 if (!pct_tailslow) {
728 throttle_inc = pct_increment;
729 } else {
730 /* Compute the ideal CPU percentage used by Guest, which may
731 * make the dirty rate match the dirty rate threshold. */
732 cpu_now = 100 - throttle_now;
733 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
734 bytes_dirty_period);
735 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
736 }
737 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
738 }
739}
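/*
 * Tailslow arithmetic, by example: with the throttle currently at 20%,
 * cpu_now = 80.  If the guest dirtied twice the threshold in the period
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), then cpu_ideal = 40
 * and the increment is MIN(80 - 40, cpu_throttle_increment), with the new
 * value capped at max_cpu_throttle.
 */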
740
91fe9a8d
RL
741void mig_throttle_counter_reset(void)
742{
743 RAMState *rs = ram_state;
744
745 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
746 rs->num_dirty_pages_period = 0;
abce5fa1 747 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
91fe9a8d
RL
748}
749
3d0684b2
JQ
750/**
751 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
752 *
6f37bb8b 753 * @rs: current RAM state
3d0684b2
JQ
754 * @current_addr: address for the zero page
755 *
756 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
757 * The important thing is that a stale (not-yet-0'd) page be replaced
758 * by the new data.
759 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 760 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 761 */
6f37bb8b 762static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 763{
56e93d26
JQ
764 /* We don't care if this fails to allocate a new cache page
765 * as long as it updated an old one */
c00e0928 766 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 767 ram_counters.dirty_sync_count);
56e93d26
JQ
768}
769
770#define ENCODING_FLAG_XBZRLE 0x1
771
772/**
773 * save_xbzrle_page: compress and send current page
774 *
775 * Returns: 1 means that we wrote the page
776 * 0 means that page is identical to the one already sent
777 * -1 means that xbzrle would be longer than normal
778 *
5a987738 779 * @rs: current RAM state
ec6f3ab9 780 * @pss: current PSS channel
3d0684b2
JQ
781 * @current_data: pointer to the address of the page contents
782 * @current_addr: addr of the page
56e93d26
JQ
783 * @block: block that contains the page we want to send
784 * @offset: offset inside the block for the page
56e93d26 785 */
ec6f3ab9 786static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
61717ea9
PX
787 uint8_t **current_data, ram_addr_t current_addr,
788 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
789{
790 int encoded_len = 0, bytes_xbzrle;
791 uint8_t *prev_cached_page;
ec6f3ab9 792 QEMUFile *file = pss->pss_channel;
56e93d26 793
9360447d
JQ
794 if (!cache_is_cached(XBZRLE.cache, current_addr,
795 ram_counters.dirty_sync_count)) {
796 xbzrle_counters.cache_miss++;
05931ec5 797 if (!rs->last_stage) {
56e93d26 798 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 799 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
800 return -1;
801 } else {
802 /* update *current_data when the page has been
803 inserted into cache */
804 *current_data = get_cached_data(XBZRLE.cache, current_addr);
805 }
806 }
807 return -1;
808 }
809
e460a4b1
WW
810 /*
811 * Reaching here means the page has hit the xbzrle cache, no matter what
812 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 813 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
814 *
815 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
816 * 2nd page turns out to be skipped (i.e. no new bytes written to the
817 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
818 * skipped page included. In this way, the encoding rate can tell if the
819 * guest page is good for xbzrle encoding.
820 */
821 xbzrle_counters.pages++;
56e93d26
JQ
822 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
823
824 /* save current buffer into memory */
825 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
826
827 /* XBZRLE encoding (if there is no overflow) */
04ffce13 828 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
829 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
830 TARGET_PAGE_SIZE);
ca353803
WY
831
832 /*
833 * Update the cache contents, so that it corresponds to the data
834 * sent, in all cases except where we skip the page.
835 */
05931ec5 836 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
837 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
838 /*
839 * In the case where we couldn't compress, ensure that the caller
840 * sends the data from the cache, since the guest might have
841 * changed the RAM since we copied it.
842 */
843 *current_data = prev_cached_page;
844 }
845
56e93d26 846 if (encoded_len == 0) {
55c4446b 847 trace_save_xbzrle_page_skipping();
56e93d26
JQ
848 return 0;
849 } else if (encoded_len == -1) {
55c4446b 850 trace_save_xbzrle_page_overflow();
9360447d 851 xbzrle_counters.overflow++;
e460a4b1 852 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
853 return -1;
854 }
855
56e93d26 856 /* Send XBZRLE based compressed page */
37502df3 857 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
204b88b8 858 offset | RAM_SAVE_FLAG_XBZRLE);
61717ea9
PX
859 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
860 qemu_put_be16(file, encoded_len);
861 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
56e93d26 862 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
863 /*
864 * Like compressed_size (please see update_compress_thread_counts),
865 * the xbzrle encoded bytes don't count the 8 byte header with
866 * RAM_SAVE_FLAG_CONTINUE.
867 */
868 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 869 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
870
871 return 1;
872}
873
3d0684b2 874/**
d9e474ea 875 * pss_find_next_dirty: find the next dirty page of current ramblock
f3f491fc 876 *
d9e474ea
PX
877 * This function updates pss->page to point to the next dirty page index
878 * within the ramblock to migrate, or the end of the ramblock when nothing
879 * is found. Note that when pss->host_page_sending==true it means we're
880 * in the middle of sending a host page, so we won't look for a dirty page
881 * outside the host page boundary.
3d0684b2 882 *
d9e474ea 883 * @pss: the current page search status
f3f491fc 884 */
d9e474ea 885static void pss_find_next_dirty(PageSearchStatus *pss)
56e93d26 886{
d9e474ea 887 RAMBlock *rb = pss->block;
6b6712ef
JQ
888 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
889 unsigned long *bitmap = rb->bmap;
56e93d26 890
fbd162e6 891 if (ramblock_is_ignored(rb)) {
d9e474ea
PX
892 /* Points directly to the end, so we know no dirty page */
893 pss->page = size;
894 return;
895 }
896
897 /*
898 * While sending a host page, only look for dirty pages within the
899 * current host page being sent.
900 */
901 if (pss->host_page_sending) {
902 assert(pss->host_page_end);
903 size = MIN(size, pss->host_page_end);
b895de50
CLG
904 }
905
d9e474ea 906 pss->page = find_next_bit(bitmap, size, pss->page);
56e93d26
JQ
907}
908
1230a25f 909static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
910 unsigned long page)
911{
912 uint8_t shift;
913 hwaddr size, start;
914
915 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
916 return;
917 }
918
919 shift = rb->clear_bmap_shift;
920 /*
921 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
922 * can make things easier sometimes since then start address
923 * of the small chunk will always be 64 pages aligned so the
924 * bitmap will always be aligned to unsigned long. We should
925 * even be able to remove this restriction but I'm simply
926 * keeping it.
927 */
928 assert(shift >= 6);
929
930 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 931 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
932 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
933 memory_region_clear_dirty_bitmap(rb->mr, start, size);
934}
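/*
 * With a clear_bmap_shift of, say, 18 and 4KiB target pages, each chunk
 * cleared here spans 1ULL << (12 + 18) = 1 GiB of guest memory; the
 * assert above guarantees a chunk is at least 64 pages, keeping the
 * per-chunk bitmap aligned to unsigned long.
 */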
935
936static void
1230a25f 937migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
938 unsigned long start,
939 unsigned long npages)
940{
941 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
942 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
943 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
944
945 /*
946 * Clear pages from start to start + npages - 1, so the end boundary is
947 * exclusive.
948 */
949 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 950 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
951 }
952}
953
a6a83cef
RL
954/*
955 * colo_bitmap_find_dirty: find contiguous dirty pages from start
956 *
957 * Returns the page offset within the memory region of the start of the
958 * contiguous dirty pages
959 *
960 * @rs: current RAM state
961 * @rb: RAMBlock where to search for dirty pages
962 * @start: page where we start the search
963 * @num: the number of contiguous dirty pages
964 */
965static inline
966unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
967 unsigned long start, unsigned long *num)
968{
969 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
970 unsigned long *bitmap = rb->bmap;
971 unsigned long first, next;
972
973 *num = 0;
974
975 if (ramblock_is_ignored(rb)) {
976 return size;
977 }
978
979 first = find_next_bit(bitmap, size, start);
980 if (first >= size) {
981 return first;
982 }
983 next = find_next_zero_bit(bitmap, size, first + 1);
984 assert(next >= first);
985 *num = next - first;
986 return first;
987}
988
06b10688 989static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
990 RAMBlock *rb,
991 unsigned long page)
a82d593b
DDAG
992{
993 bool ret;
a82d593b 994
002cad6b
PX
995 /*
996 * Clear dirty bitmap if needed. This _must_ be called before we
997 * send any of the pages in the chunk because we need to make sure
998 * we can capture further page content changes when we sync the dirty
999 * log the next time. So as long as we are going to send any of
1000 * the pages in the chunk we clear the remote dirty bitmap for all.
1001 * Clearing it earlier won't be a problem, but too late will.
1002 */
1230a25f 1003 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 1004
6b6712ef 1005 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 1006 if (ret) {
0d8ec885 1007 rs->migration_dirty_pages--;
a82d593b 1008 }
386a907b 1009
a82d593b
DDAG
1010 return ret;
1011}
1012
be39b4cd
DH
1013static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1014 void *opaque)
1015{
1016 const hwaddr offset = section->offset_within_region;
1017 const hwaddr size = int128_get64(section->size);
1018 const unsigned long start = offset >> TARGET_PAGE_BITS;
1019 const unsigned long npages = size >> TARGET_PAGE_BITS;
1020 RAMBlock *rb = section->mr->ram_block;
1021 uint64_t *cleared_bits = opaque;
1022
1023 /*
1024 * We don't grab ram_state->bitmap_mutex because we expect to run
1025 * only when starting migration or during postcopy recovery where
1026 * we don't have concurrent access.
1027 */
1028 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1029 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1030 }
1031 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1032 bitmap_clear(rb->bmap, start, npages);
1033}
1034
1035/*
1036 * Exclude all dirty pages from migration that fall into a discarded range as
1037 * managed by a RamDiscardManager responsible for the mapped memory region of
1038 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1039 *
1040 * Discarded pages ("logically unplugged") have undefined content and must
1041 * not get migrated, because even reading these pages for migration might
1042 * result in undesired behavior.
1043 *
1044 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1045 *
1046 * Note: The result is only stable while migrating (precopy/postcopy).
1047 */
1048static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1049{
1050 uint64_t cleared_bits = 0;
1051
1052 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1053 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1054 MemoryRegionSection section = {
1055 .mr = rb->mr,
1056 .offset_within_region = 0,
1057 .size = int128_make64(qemu_ram_get_used_length(rb)),
1058 };
1059
1060 ram_discard_manager_replay_discarded(rdm, &section,
1061 dirty_bitmap_clear_section,
1062 &cleared_bits);
1063 }
1064 return cleared_bits;
1065}
1066
9470c5e0
DH
1067/*
1068 * Check if a host-page aligned page falls into a discarded range as managed by
1069 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1070 *
1071 * Note: The result is only stable while migrating (precopy/postcopy).
1072 */
1073bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1074{
1075 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1076 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1077 MemoryRegionSection section = {
1078 .mr = rb->mr,
1079 .offset_within_region = start,
1080 .size = int128_make64(qemu_ram_pagesize(rb)),
1081 };
1082
1083 return !ram_discard_manager_is_populated(rdm, &section);
1084 }
1085 return false;
1086}
1087
267691b6 1088/* Called with RCU critical section */
7a3e9571 1089static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1090{
fb613580
KZ
1091 uint64_t new_dirty_pages =
1092 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1093
1094 rs->migration_dirty_pages += new_dirty_pages;
1095 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1096}
1097
3d0684b2
JQ
1098/**
1099 * ram_pagesize_summary: calculate all the pagesizes of a VM
1100 *
1101 * Returns a summary bitmap of the page sizes of all RAMBlocks
1102 *
1103 * For VMs with just normal pages this is equivalent to the host page
1104 * size. If it's got some huge pages then it's the OR of all the
1105 * different page sizes.
e8ca1db2
DDAG
1106 */
1107uint64_t ram_pagesize_summary(void)
1108{
1109 RAMBlock *block;
1110 uint64_t summary = 0;
1111
fbd162e6 1112 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1113 summary |= block->page_size;
1114 }
1115
1116 return summary;
1117}
1118
aecbfe9c
XG
1119uint64_t ram_get_total_transferred_pages(void)
1120{
abce5fa1
JQ
1121 return stat64_get(&ram_counters.normal) +
1122 stat64_get(&ram_counters.duplicate) +
23b7576d 1123 compression_counters.pages + xbzrle_counters.pages;
aecbfe9c
XG
1124}
1125
b734035b
XG
1126static void migration_update_rates(RAMState *rs, int64_t end_time)
1127{
be8b02ed 1128 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1129 double compressed_size;
b734035b
XG
1130
1131 /* calculate period counters */
1132 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1133 / (end_time - rs->time_last_bitmap_sync);
1134
be8b02ed 1135 if (!page_count) {
b734035b
XG
1136 return;
1137 }
1138
1139 if (migrate_use_xbzrle()) {
e460a4b1
WW
1140 double encoded_size, unencoded_size;
1141
b734035b 1142 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1143 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1144 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1145 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1146 TARGET_PAGE_SIZE;
1147 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1148 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1149 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1150 } else {
1151 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1152 }
1153 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1154 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1155 }
76e03000
XG
1156
1157 if (migrate_use_compression()) {
1158 compression_counters.busy_rate = (double)(compression_counters.busy -
1159 rs->compress_thread_busy_prev) / page_count;
1160 rs->compress_thread_busy_prev = compression_counters.busy;
1161
1162 compressed_size = compression_counters.compressed_size -
1163 rs->compressed_size_prev;
1164 if (compressed_size) {
1165 double uncompressed_size = (compression_counters.pages -
1166 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1167
1168 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1169 compression_counters.compression_rate =
1170 uncompressed_size / compressed_size;
1171
1172 rs->compress_pages_prev = compression_counters.pages;
1173 rs->compressed_size_prev = compression_counters.compressed_size;
1174 }
1175 }
b734035b
XG
1176}
1177
dc14a470
KZ
1178static void migration_trigger_throttle(RAMState *rs)
1179{
1180 MigrationState *s = migrate_get_current();
1181 uint64_t threshold = s->parameters.throttle_trigger_threshold;
23b7576d 1182 uint64_t bytes_xfer_period =
abce5fa1 1183 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
dc14a470
KZ
1184 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1186
1187 /* During block migration the auto-converge logic incorrectly detects
1188 * that ram migration makes no progress. Avoid this by disabling the
1189 * throttling logic during the bulk phase of block migration. */
1190 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191 /* The following detection logic can be refined later. For now:
1192 Check to see if the ratio between dirtied bytes and the approx.
1193 amount of bytes that just got transferred since the last time
1194 we were in this routine reaches the threshold. If that happens
1195 twice, start or increase throttling. */
1196
1197 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198 (++rs->dirty_rate_high_cnt >= 2)) {
1199 trace_migration_throttle();
1200 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1201 mig_throttle_guest_down(bytes_dirty_period,
1202 bytes_dirty_threshold);
dc14a470
KZ
1203 }
1204 }
1205}
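/*
 * For instance, with throttle_trigger_threshold at 50 and 1 GiB
 * transferred during the period, bytes_dirty_threshold is 512 MiB; if the
 * guest dirties more than that in two consecutive sync periods,
 * mig_throttle_guest_down() starts or tightens the throttle.
 */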
1206
8d820d6f 1207static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1208{
1209 RAMBlock *block;
56e93d26 1210 int64_t end_time;
56e93d26 1211
9360447d 1212 ram_counters.dirty_sync_count++;
56e93d26 1213
f664da80
JQ
1214 if (!rs->time_last_bitmap_sync) {
1215 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1216 }
1217
1218 trace_migration_bitmap_sync_start();
9c1f8f44 1219 memory_global_dirty_log_sync();
56e93d26 1220
108cfae0 1221 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1222 WITH_RCU_READ_LOCK_GUARD() {
1223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224 ramblock_sync_dirty_bitmap(rs, block);
1225 }
1226 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1227 }
108cfae0 1228 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1229
9458a9a1 1230 memory_global_after_dirty_log_sync();
a66cd90c 1231 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1232
56e93d26
JQ
1233 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1234
1235 /* more than 1 second = 1000 milliseconds */
f664da80 1236 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1237 migration_trigger_throttle(rs);
070afca2 1238
b734035b
XG
1239 migration_update_rates(rs, end_time);
1240
be8b02ed 1241 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1242
1243 /* reset period counters */
f664da80 1244 rs->time_last_bitmap_sync = end_time;
a66cd90c 1245 rs->num_dirty_pages_period = 0;
abce5fa1 1246 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
56e93d26 1247 }
4addcd4f 1248 if (migrate_use_events()) {
3ab72385 1249 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1250 }
56e93d26
JQ
1251}
1252
bd227060
WW
1253static void migration_bitmap_sync_precopy(RAMState *rs)
1254{
1255 Error *local_err = NULL;
1256
1257 /*
1258 * The current notifier usage is just an optimization for migration, so we
1259 * don't stop the normal migration process in the error case.
1260 */
1261 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1262 error_report_err(local_err);
b4a1733c 1263 local_err = NULL;
bd227060
WW
1264 }
1265
1266 migration_bitmap_sync(rs);
1267
1268 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1269 error_report_err(local_err);
1270 }
1271}
1272
a4dbaf8e 1273void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1274{
1275 if (!migrate_release_ram() || !migration_in_postcopy()) {
1276 return;
1277 }
1278
1279 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1280}
1281
6c97ec5f
XG
1282/**
1283 * save_zero_page_to_file: send the zero page to the file
1284 *
1285 * Returns the size of data written to the file, 0 means the page is not
1286 * a zero page
1287 *
ec6f3ab9 1288 * @pss: current PSS channel
6c97ec5f
XG
1289 * @block: block that contains the page we want to send
1290 * @offset: offset inside the block for the page
1291 */
37502df3 1292static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
6c97ec5f
XG
1293 RAMBlock *block, ram_addr_t offset)
1294{
1295 uint8_t *p = block->host + offset;
1296 int len = 0;
1297
bad452a7 1298 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
37502df3 1299 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
6c97ec5f
XG
1300 qemu_put_byte(file, 0);
1301 len += 1;
47fe16ff 1302 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1303 }
1304 return len;
1305}
1306
56e93d26 1307/**
3d0684b2 1308 * save_zero_page: send the zero page to the stream
56e93d26 1309 *
3d0684b2 1310 * Returns the number of pages written.
56e93d26 1311 *
ec6f3ab9 1312 * @pss: current PSS channel
56e93d26
JQ
1313 * @block: block that contains the page we want to send
1314 * @offset: offset inside the block for the page
56e93d26 1315 */
37502df3 1316static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
61717ea9 1317 ram_addr_t offset)
56e93d26 1318{
37502df3 1319 int len = save_zero_page_to_file(pss, f, block, offset);
56e93d26 1320
6c97ec5f 1321 if (len) {
abce5fa1 1322 stat64_add(&ram_counters.duplicate, 1);
4c2d0f6d 1323 ram_transferred_add(len);
6c97ec5f 1324 return 1;
56e93d26 1325 }
6c97ec5f 1326 return -1;
56e93d26
JQ
1327}
1328
059ff0fb
XG
1329/*
1330 * @pages: the number of pages written by the control path,
1331 * < 0 - error
1332 * > 0 - number of pages written
1333 *
1334 * Return true if the page has been saved, otherwise false.
1335 */
61717ea9
PX
1336static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1337 ram_addr_t offset, int *pages)
059ff0fb
XG
1338{
1339 uint64_t bytes_xmit = 0;
1340 int ret;
1341
1342 *pages = -1;
61717ea9
PX
1343 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1344 TARGET_PAGE_SIZE, &bytes_xmit);
059ff0fb
XG
1345 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1346 return false;
1347 }
1348
1349 if (bytes_xmit) {
4c2d0f6d 1350 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1351 *pages = 1;
1352 }
1353
1354 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1355 return true;
1356 }
1357
1358 if (bytes_xmit > 0) {
abce5fa1 1359 stat64_add(&ram_counters.normal, 1);
059ff0fb 1360 } else if (bytes_xmit == 0) {
abce5fa1 1361 stat64_add(&ram_counters.duplicate, 1);
059ff0fb
XG
1362 }
1363
1364 return true;
1365}
1366
65dacaa0
XG
1367/*
1368 * directly send the page to the stream
1369 *
1370 * Returns the number of pages written.
1371 *
ec6f3ab9 1372 * @pss: current PSS channel
65dacaa0
XG
1373 * @block: block that contains the page we want to send
1374 * @offset: offset inside the block for the page
1375 * @buf: the page to be sent
1376 * @async: send the page asynchronously
1377 */
ec6f3ab9 1378static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1379 ram_addr_t offset, uint8_t *buf, bool async)
65dacaa0 1380{
ec6f3ab9
PX
1381 QEMUFile *file = pss->pss_channel;
1382
37502df3 1383 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
4c2d0f6d 1384 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0 1385 if (async) {
61717ea9 1386 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
f912ec5b 1387 migrate_release_ram() &&
65dacaa0
XG
1388 migration_in_postcopy());
1389 } else {
61717ea9 1390 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
65dacaa0 1391 }
4c2d0f6d 1392 ram_transferred_add(TARGET_PAGE_SIZE);
abce5fa1 1393 stat64_add(&ram_counters.normal, 1);
65dacaa0
XG
1394 return 1;
1395}
1396
56e93d26 1397/**
3d0684b2 1398 * ram_save_page: send the given page to the stream
56e93d26 1399 *
3d0684b2 1400 * Returns the number of pages written.
3fd3c4b3
DDAG
1401 * < 0 - error
1402 * >=0 - Number of pages written - this might legally be 0
1403 * if xbzrle noticed the page was the same.
56e93d26 1404 *
6f37bb8b 1405 * @rs: current RAM state
56e93d26
JQ
1406 * @block: block that contains the page we want to send
1407 * @offset: offset inside the block for the page
56e93d26 1408 */
05931ec5 1409static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1410{
1411 int pages = -1;
56e93d26 1412 uint8_t *p;
56e93d26 1413 bool send_async = true;
a08f6890 1414 RAMBlock *block = pss->block;
8bba004c 1415 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1416 ram_addr_t current_addr = block->offset + offset;
56e93d26 1417
2f68e399 1418 p = block->host + offset;
1db9d8e5 1419 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1420
56e93d26 1421 XBZRLE_cache_lock();
1a373522 1422 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
ec6f3ab9 1423 pages = save_xbzrle_page(rs, pss, &p, current_addr,
61717ea9 1424 block, offset);
05931ec5 1425 if (!rs->last_stage) {
059ff0fb
XG
1426 /* Can't send this cached data async, since the cache page
1427 * might get updated before it gets to the wire
56e93d26 1428 */
059ff0fb 1429 send_async = false;
56e93d26
JQ
1430 }
1431 }
1432
1433 /* XBZRLE overflow or normal page */
1434 if (pages == -1) {
ec6f3ab9 1435 pages = save_normal_page(pss, block, offset, p, send_async);
56e93d26
JQ
1436 }
1437
1438 XBZRLE_cache_unlock();
1439
1440 return pages;
1441}
1442
61717ea9 1443static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
b9ee2f7d
JQ
1444 ram_addr_t offset)
1445{
61717ea9 1446 if (multifd_queue_page(file, block, offset) < 0) {
713f762a
IR
1447 return -1;
1448 }
abce5fa1 1449 stat64_add(&ram_counters.normal, 1);
b9ee2f7d
JQ
1450
1451 return 1;
1452}
1453
5e5fdcff 1454static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1455 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1456{
53518d94 1457 RAMState *rs = ram_state;
ec6f3ab9 1458 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
20d549cb 1459 uint8_t *p = block->host + offset;
6ef3771c 1460 int ret;
56e93d26 1461
37502df3 1462 if (save_zero_page_to_file(pss, f, block, offset)) {
e7f2e190 1463 return true;
5e5fdcff
XG
1464 }
1465
37502df3 1466 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1467
1468 /*
1469 * copy it to an internal buffer to avoid it being modified by the VM
1470 * so that we can catch the error during compression and
1471 * decompression
1472 */
1473 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1474 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1475 if (ret < 0) {
1476 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1477 error_report("compressed data failed!");
b3be2896 1478 }
e7f2e190 1479 return false;
5e5fdcff
XG
1480}
1481
1482static void
1483update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1484{
4c2d0f6d 1485 ram_transferred_add(bytes_xmit);
76e03000 1486
5e5fdcff 1487 if (param->zero_page) {
abce5fa1 1488 stat64_add(&ram_counters.duplicate, 1);
76e03000 1489 return;
5e5fdcff 1490 }
76e03000
XG
1491
1492 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1493 compression_counters.compressed_size += bytes_xmit - 8;
1494 compression_counters.pages++;
56e93d26
JQ
1495}
1496
32b05495
XG
1497static bool save_page_use_compression(RAMState *rs);
1498
ce25d337 1499static void flush_compressed_data(RAMState *rs)
56e93d26 1500{
eaa238ab 1501 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1502 int idx, len, thread_count;
1503
32b05495 1504 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1505 return;
1506 }
1507 thread_count = migrate_compress_threads();
a7a9a88f 1508
0d9f9a5c 1509 qemu_mutex_lock(&comp_done_lock);
56e93d26 1510 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1511 while (!comp_param[idx].done) {
0d9f9a5c 1512 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1513 }
a7a9a88f 1514 }
0d9f9a5c 1515 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1516
1517 for (idx = 0; idx < thread_count; idx++) {
1518 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1519 if (!comp_param[idx].quit) {
eaa238ab 1520 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
5e5fdcff
XG
1521 /*
1522 * it's safe to fetch zero_page without holding comp_done_lock
1523 * as there is no further request submitted to the thread,
1524 * i.e., the thread should be waiting for a request at this point.
1525 */
1526 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1527 }
a7a9a88f 1528 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1529 }
1530}
1531
1532static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1533 ram_addr_t offset)
1534{
1535 param->block = block;
1536 param->offset = offset;
1537}
1538
eaa238ab 1539static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
1540{
1541 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1542 bool wait = migrate_compress_wait_thread();
eaa238ab 1543 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1544
1545 thread_count = migrate_compress_threads();
0d9f9a5c 1546 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1547retry:
1548 for (idx = 0; idx < thread_count; idx++) {
1549 if (comp_param[idx].done) {
1550 comp_param[idx].done = false;
eaa238ab
PX
1551 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1552 comp_param[idx].file);
1d58872a
XG
1553 qemu_mutex_lock(&comp_param[idx].mutex);
1554 set_compress_params(&comp_param[idx], block, offset);
1555 qemu_cond_signal(&comp_param[idx].cond);
1556 qemu_mutex_unlock(&comp_param[idx].mutex);
1557 pages = 1;
5e5fdcff 1558 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1559 break;
56e93d26
JQ
1560 }
1561 }
1d58872a
XG
1562
1563 /*
1564 * wait for a free thread if the user specifies 'compress-wait-thread',
1565 * otherwise we will post the page out in the main thread as a normal page.
1566 */
1567 if (pages < 0 && wait) {
1568 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1569 goto retry;
1570 }
0d9f9a5c 1571 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1572
1573 return pages;
1574}
1575
31e2ac74
JQ
1576#define PAGE_ALL_CLEAN 0
1577#define PAGE_TRY_AGAIN 1
1578#define PAGE_DIRTY_FOUND 2
3d0684b2
JQ
1579/**
1580 * find_dirty_block: find the next dirty page and update any state
1581 * associated with the search process.
b9e60928 1582 *
31e2ac74
JQ
1583 * Returns:
1584 * PAGE_ALL_CLEAN: no dirty page found, give up
1585 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1586 * PAGE_DIRTY_FOUND: dirty page found
b9e60928 1587 *
6f37bb8b 1588 * @rs: current RAM state
3d0684b2
JQ
1589 * @pss: data about the state of the current dirty page scan
b9e60928 1591 */
31e2ac74 1592static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
b9e60928 1593{
d9e474ea
PX
1594 /* Update pss->page for the next dirty bit in ramblock */
1595 pss_find_next_dirty(pss);
1596
6f37bb8b 1597 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1598 pss->page >= rs->last_page) {
b9e60928
DDAG
1599 /*
1600 * We've been once around the RAM and haven't found anything.
1601 * Give up.
1602 */
31e2ac74 1603 return PAGE_ALL_CLEAN;
b9e60928 1604 }
542147f4
DH
1605 if (!offset_in_ramblock(pss->block,
1606 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1607 /* Didn't find anything in this RAM Block */
a935e30f 1608 pss->page = 0;
b9e60928
DDAG
1609 pss->block = QLIST_NEXT_RCU(pss->block, next);
1610 if (!pss->block) {
48df9d80
XG
1611 /*
1612 * If memory migration starts over, we will meet a dirtied page
1613 * which may still exist in the compression threads' ring, so we
1614 * should flush the compressed data to make sure the new page
1615 * is not overwritten by the old one in the destination.
1616 *
1617 * Also, if xbzrle is on, stop using data compression at this
1618 * point. In theory, xbzrle can do better than compression.
1619 */
1620 flush_compressed_data(rs);
1621
b9e60928
DDAG
1622 /* Hit the end of the list */
1623 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1624 /* Flag that we've looped */
1625 pss->complete_round = true;
1a373522
DH
1626 /* After the first round, enable XBZRLE. */
1627 if (migrate_use_xbzrle()) {
1628 rs->xbzrle_enabled = true;
1629 }
b9e60928
DDAG
1630 }
1631 /* Didn't find anything this time, but try again on the new block */
31e2ac74 1632 return PAGE_TRY_AGAIN;
b9e60928 1633 } else {
31e2ac74
JQ
1634 /* We've found something */
1635 return PAGE_DIRTY_FOUND;
b9e60928
DDAG
1636 }
1637}
1638
3d0684b2
JQ
1639/**
1640 * unqueue_page: gets a page off the queue
1641 *
a82d593b 1642 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1643 *
3d0684b2
JQ
1644 * Returns the block of the page (or NULL if none available)
1645 *
ec481c6c 1646 * @rs: current RAM state
3d0684b2 1647 * @offset: used to return the offset within the RAMBlock
a82d593b 1648 */
f20e2865 1649static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1650{
a1fe28df 1651 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1652 RAMBlock *block = NULL;
1653
a1fe28df 1654 if (!postcopy_has_request(rs)) {
ae526e32
XG
1655 return NULL;
1656 }
1657
6e8a355d 1658 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1659
1660 /*
1661 * This should _never_ change even after we take the lock, because no one
1662 * should be taking anything off the request list other than us.
1663 */
1664 assert(postcopy_has_request(rs));
1665
1666 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1667 block = entry->rb;
1668 *offset = entry->offset;
1669
777f53c7
TH
1670 if (entry->len > TARGET_PAGE_SIZE) {
1671 entry->len -= TARGET_PAGE_SIZE;
1672 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1673 } else {
1674 memory_region_unref(block->mr);
1675 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1676 g_free(entry);
1677 migration_consume_urgent_request();
a82d593b 1678 }
a82d593b
DDAG
1679
1680 return block;
1681}
1682
278e2f55
AG
1683#if defined(__linux__)
1684/**
1685 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1686 * is found, return RAM block pointer and page offset
1687 *
1688 * Returns pointer to the RAMBlock containing faulting page,
1689 * NULL if no write faults are pending
1690 *
1691 * @rs: current RAM state
1692 * @offset: page offset from the beginning of the block
1693 */
1694static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1695{
1696 struct uffd_msg uffd_msg;
1697 void *page_address;
82ea3e3b 1698 RAMBlock *block;
278e2f55
AG
1699 int res;
1700
1701 if (!migrate_background_snapshot()) {
1702 return NULL;
1703 }
1704
1705 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1706 if (res <= 0) {
1707 return NULL;
1708 }
1709
1710 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1711 block = qemu_ram_block_from_host(page_address, false, offset);
1712 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1713 return block;
278e2f55
AG
1714}
1715
1716/**
1717 * ram_save_release_protection: release UFFD write protection after
1718 * a range of pages has been saved
1719 *
1720 * @rs: current RAM state
1721 * @pss: page-search-status structure
1722 * @start_page: index of the first page in the range relative to pss->block
1723 *
1724 * Returns 0 on success, negative value in case of an error
1725 */
1726static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1727 unsigned long start_page)
1728{
1729 int res = 0;
1730
1731 /* Check if page is from UFFD-managed region. */
1732 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1733 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1734 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1735
1736 /* Flush async buffers before un-protect. */
61717ea9 1737 qemu_fflush(pss->pss_channel);
278e2f55
AG
1738 /* Un-protect memory range. */
1739 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1740 false, false);
1741 }
1742
1743 return res;
1744}
1745
1746/* ram_write_tracking_available: check if kernel supports required UFFD features
1747 *
1748 * Returns true if supported, false otherwise
1749 */
1750bool ram_write_tracking_available(void)
1751{
1752 uint64_t uffd_features;
1753 int res;
1754
1755 res = uffd_query_features(&uffd_features);
1756 return (res == 0 &&
1757 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1758}
1759
1760/* ram_write_tracking_compatible: check if guest configuration is
1761 * compatible with 'write-tracking'
1762 *
1763 * Returns true if compatible, false otherwise
1764 */
1765bool ram_write_tracking_compatible(void)
1766{
1767 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1768 int uffd_fd;
82ea3e3b 1769 RAMBlock *block;
278e2f55
AG
1770 bool ret = false;
1771
1772 /* Open UFFD file descriptor */
1773 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1774 if (uffd_fd < 0) {
1775 return false;
1776 }
1777
1778 RCU_READ_LOCK_GUARD();
1779
82ea3e3b 1780 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1781 uint64_t uffd_ioctls;
1782
1783 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1784 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1785 continue;
1786 }
1787 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1788 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1789 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1790 goto out;
1791 }
1792 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1793 goto out;
1794 }
1795 }
1796 ret = true;
1797
1798out:
1799 uffd_close_fd(uffd_fd);
1800 return ret;
1801}
1802
f7b9dcfb
DH
1803static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1804 ram_addr_t size)
1805{
5f19a449
DH
1806 const ram_addr_t end = offset + size;
1807
f7b9dcfb
DH
1808 /*
1809 * We read one byte of each page; this will preallocate page tables if
1810 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1811 * where no page was populated yet. This might require adaptation when
1812 * supporting other mappings, like shmem.
1813 */
5f19a449 1814 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1815 char tmp = *((char *)block->host + offset);
1816
1817 /* Don't optimize the read out */
1818 asm volatile("" : "+r" (tmp));
1819 }
1820}
1821
6fee3a1f
DH
1822static inline int populate_read_section(MemoryRegionSection *section,
1823 void *opaque)
1824{
1825 const hwaddr size = int128_get64(section->size);
1826 hwaddr offset = section->offset_within_region;
1827 RAMBlock *block = section->mr->ram_block;
1828
1829 populate_read_range(block, offset, size);
1830 return 0;
1831}
1832
eeccb99c 1833/*
f7b9dcfb
DH
1834 * ram_block_populate_read: preallocate page tables and populate pages in the
1835 * RAM block by reading a byte of each page.
eeccb99c
AG
1836 *
1837 * Since it's solely used for userfault_fd WP feature, here we just
1838 * hardcode page size to qemu_real_host_page_size.
1839 *
82ea3e3b 1840 * @rb: RAM block to populate
eeccb99c 1841 */
6fee3a1f 1842static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1843{
6fee3a1f
DH
1844 /*
1845 * Skip populating all pages that fall into a discarded range as managed by
1846 * a RamDiscardManager responsible for the mapped memory region of the
1847 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1848 * must not get populated automatically. We don't have to track
1849 * modifications via userfaultfd WP reliably, because these pages will
1850 * not be part of the migration stream either way -- see
1851 * ramblock_dirty_bitmap_exclude_discarded_pages().
1852 *
1853 * Note: The result is only stable while migrating (precopy/postcopy).
1854 */
1855 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1856 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1857 MemoryRegionSection section = {
1858 .mr = rb->mr,
1859 .offset_within_region = 0,
1860 .size = rb->mr->size,
1861 };
1862
1863 ram_discard_manager_replay_populated(rdm, &section,
1864 populate_read_section, NULL);
1865 } else {
1866 populate_read_range(rb, 0, rb->used_length);
1867 }
eeccb99c
AG
1868}
1869
1870/*
1871 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1872 */
1873void ram_write_tracking_prepare(void)
1874{
82ea3e3b 1875 RAMBlock *block;
eeccb99c
AG
1876
1877 RCU_READ_LOCK_GUARD();
1878
82ea3e3b 1879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1880 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1881 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1882 continue;
1883 }
1884
1885 /*
1886 * Populate pages of the RAM block before enabling userfault_fd
1887 * write protection.
1888 *
1889 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1890 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1891 * pages with pte_none() entries in page table.
1892 */
f7b9dcfb 1893 ram_block_populate_read(block);
eeccb99c
AG
1894 }
1895}
1896
e41c5770
DH
1897static inline int uffd_protect_section(MemoryRegionSection *section,
1898 void *opaque)
1899{
1900 const hwaddr size = int128_get64(section->size);
1901 const hwaddr offset = section->offset_within_region;
1902 RAMBlock *rb = section->mr->ram_block;
1903 int uffd_fd = (uintptr_t)opaque;
1904
1905 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1906 false);
1907}
1908
1909static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1910{
1911 assert(rb->flags & RAM_UF_WRITEPROTECT);
1912
1913 /* See ram_block_populate_read() */
1914 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1915 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1916 MemoryRegionSection section = {
1917 .mr = rb->mr,
1918 .offset_within_region = 0,
1919 .size = rb->mr->size,
1920 };
1921
1922 return ram_discard_manager_replay_populated(rdm, &section,
1923 uffd_protect_section,
1924 (void *)(uintptr_t)uffd_fd);
1925 }
1926 return uffd_change_protection(uffd_fd, rb->host,
1927 rb->used_length, true, false);
1928}
1929
278e2f55
AG
1930/*
1931 * ram_write_tracking_start: start UFFD-WP memory tracking
1932 *
1933 * Returns 0 for success or negative value in case of error
1934 */
1935int ram_write_tracking_start(void)
1936{
1937 int uffd_fd;
1938 RAMState *rs = ram_state;
82ea3e3b 1939 RAMBlock *block;
278e2f55
AG
1940
1941 /* Open UFFD file descriptor */
1942 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1943 if (uffd_fd < 0) {
1944 return uffd_fd;
1945 }
1946 rs->uffdio_fd = uffd_fd;
1947
1948 RCU_READ_LOCK_GUARD();
1949
82ea3e3b 1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1951 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1952 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1953 continue;
1954 }
1955
1956 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1957 if (uffd_register_memory(rs->uffdio_fd, block->host,
1958 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1959 goto fail;
1960 }
72ef3a37
DH
1961 block->flags |= RAM_UF_WRITEPROTECT;
1962 memory_region_ref(block->mr);
1963
278e2f55 1964 /* Apply UFFD write protection to the block memory range */
e41c5770 1965 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1966 goto fail;
1967 }
278e2f55 1968
82ea3e3b
AG
1969 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1970 block->host, block->max_length);
278e2f55
AG
1971 }
1972
1973 return 0;
1974
1975fail:
1976 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1977
82ea3e3b
AG
1978 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1979 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1980 continue;
1981 }
82ea3e3b 1982 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1983 /* Cleanup flags and remove reference */
82ea3e3b
AG
1984 block->flags &= ~RAM_UF_WRITEPROTECT;
1985 memory_region_unref(block->mr);
278e2f55
AG
1986 }
1987
1988 uffd_close_fd(uffd_fd);
1989 rs->uffdio_fd = -1;
1990 return -1;
1991}
1992
1993/**
1994 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1995 */
1996void ram_write_tracking_stop(void)
1997{
1998 RAMState *rs = ram_state;
82ea3e3b 1999 RAMBlock *block;
278e2f55
AG
2000
2001 RCU_READ_LOCK_GUARD();
2002
82ea3e3b
AG
2003 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2004 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
2005 continue;
2006 }
82ea3e3b 2007 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 2008
82ea3e3b
AG
2009 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2010 block->host, block->max_length);
278e2f55
AG
2011
2012 /* Cleanup flags and remove reference */
82ea3e3b
AG
2013 block->flags &= ~RAM_UF_WRITEPROTECT;
2014 memory_region_unref(block->mr);
278e2f55
AG
2015 }
2016
2017 /* Finally close UFFD file descriptor */
2018 uffd_close_fd(rs->uffdio_fd);
2019 rs->uffdio_fd = -1;
2020}
2021
2022#else
2023/* No target OS support, stubs just fail or ignore */
2024
2025static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2026{
2027 (void) rs;
2028 (void) offset;
2029
2030 return NULL;
2031}
2032
2033static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2034 unsigned long start_page)
2035{
2036 (void) rs;
2037 (void) pss;
2038 (void) start_page;
2039
2040 return 0;
2041}
2042
2043bool ram_write_tracking_available(void)
2044{
2045 return false;
2046}
2047
2048bool ram_write_tracking_compatible(void)
2049{
2050 assert(0);
2051 return false;
2052}
2053
2054int ram_write_tracking_start(void)
2055{
2056 assert(0);
2057 return -1;
2058}
2059
2060void ram_write_tracking_stop(void)
2061{
2062 assert(0);
2063}
2064#endif /* defined(__linux__) */
2065
3d0684b2 2066/**
ff1543af 2067 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2068 *
2069 * Skips pages that are already sent (!dirty)
a82d593b 2070 *
a5f7b1a6 2071 * Returns true if a queued page is found
a82d593b 2072 *
6f37bb8b 2073 * @rs: current RAM state
3d0684b2 2074 * @pss: data about the state of the current dirty page scan
a82d593b 2075 */
f20e2865 2076static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2077{
2078 RAMBlock *block;
2079 ram_addr_t offset;
777f53c7
TH
2080 bool dirty;
2081
2082 do {
2083 block = unqueue_page(rs, &offset);
2084 /*
2085 * We're sending this page, and since it's postcopy nothing else
2086 * will dirty it, and we must make sure it doesn't get sent again
2087 * even if this queue request was received after the background
2088 * search already sent it.
2089 */
2090 if (block) {
2091 unsigned long page;
2092
2093 page = offset >> TARGET_PAGE_BITS;
2094 dirty = test_bit(page, block->bmap);
2095 if (!dirty) {
2096 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2097 page);
2098 } else {
2099 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2100 }
2101 }
a82d593b 2102
777f53c7 2103 } while (block && !dirty);
a82d593b 2104
b062106d 2105 if (!block) {
278e2f55
AG
2106 /*
2107 * Poll write faults too if background snapshot is enabled; that's
2108 * when we have vcpus got blocked by the write protected pages.
2109 */
2110 block = poll_fault_page(rs, &offset);
2111 }
2112
a82d593b 2113 if (block) {
a82d593b
DDAG
2114 /*
2115 * We want the background search to continue from the queued page
2116 * since the guest is likely to want other pages near to the page
2117 * it just requested.
2118 */
2119 pss->block = block;
a935e30f 2120 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2121
2122 /*
2123 * This unqueued page would break the "one round" check, even if it
2124 * is really rare.
2125 */
2126 pss->complete_round = false;
a82d593b
DDAG
2127 }
2128
2129 return !!block;
2130}
2131
6c595cde 2132/**
5e58f968
JQ
2133 * migration_page_queue_free: drop any remaining pages in the ram
2134 * request queue
6c595cde 2135 *
3d0684b2
JQ
2136 * It should be empty at the end anyway, but in error cases there may
2137 * be some left. In case there are any pages left, we drop them.
2138 *
6c595cde 2139 */
83c13382 2140static void migration_page_queue_free(RAMState *rs)
6c595cde 2141{
ec481c6c 2142 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2143 /* This queue should generally be empty - but in the case of a failed
2144 * migration it might have some entries left over.
2145 */
89ac5a1d 2146 RCU_READ_LOCK_GUARD();
ec481c6c 2147 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2148 memory_region_unref(mspr->rb->mr);
ec481c6c 2149 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2150 g_free(mspr);
2151 }
6c595cde
DDAG
2152}
2153
2154/**
3d0684b2
JQ
2155 * ram_save_queue_pages: queue the page for transmission
2156 *
2157 * A request from postcopy destination for example.
2158 *
2159 * Returns zero on success or negative on error
2160 *
3d0684b2
JQ
2161 * @rbname: Name of the RAMBlock of the request. NULL means the
2162 * same as the last one.
2163 * @start: starting address from the start of the RAMBlock
2164 * @len: length (in bytes) to send
6c595cde 2165 */
96506894 2166int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2167{
2168 RAMBlock *ramblock;
53518d94 2169 RAMState *rs = ram_state;
6c595cde 2170
9360447d 2171 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2172 RCU_READ_LOCK_GUARD();
2173
6c595cde
DDAG
2174 if (!rbname) {
2175 /* Reuse last RAMBlock */
68a098f3 2176 ramblock = rs->last_req_rb;
6c595cde
DDAG
2177
2178 if (!ramblock) {
2179 /*
2180 * Shouldn't happen, we can't reuse the last RAMBlock if
2181 * it's the 1st request.
2182 */
2183 error_report("ram_save_queue_pages no previous block");
03acb4e9 2184 return -1;
6c595cde
DDAG
2185 }
2186 } else {
2187 ramblock = qemu_ram_block_by_name(rbname);
2188
2189 if (!ramblock) {
2190 /* We shouldn't be asked for a non-existent RAMBlock */
2191 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2192 return -1;
6c595cde 2193 }
68a098f3 2194 rs->last_req_rb = ramblock;
6c595cde
DDAG
2195 }
2196 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2197 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2198 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2199 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2200 __func__, start, len, ramblock->used_length);
03acb4e9 2201 return -1;
6c595cde
DDAG
2202 }
2203
93589827
PX
2204 /*
2205 * When with postcopy preempt, we send back the page directly in the
2206 * rp-return thread.
2207 */
2208 if (postcopy_preempt_active()) {
2209 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2210 size_t page_size = qemu_ram_pagesize(ramblock);
2211 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2212 int ret = 0;
2213
2214 qemu_mutex_lock(&rs->bitmap_mutex);
2215
2216 pss_init(pss, ramblock, page_start);
2217 /*
2218 * Always use the preempt channel, and make sure it's there. It's
2219 * safe to access without a lock, because when the rp-thread is running
2220 * we should be the only one operating on the qemufile.
2221 */
2222 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2223 assert(pss->pss_channel);
2224
2225 /*
2226 * It must be one host page or a multiple of the host page size. Just
2227 * assert; if something is wrong we're mostly split-brain anyway.
2228 */
2229 assert(len % page_size == 0);
2230 while (len) {
2231 if (ram_save_host_page_urgent(pss)) {
2232 error_report("%s: ram_save_host_page_urgent() failed: "
2233 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2234 __func__, ramblock->idstr, start);
2235 ret = -1;
2236 break;
2237 }
2238 /*
2239 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2240 * will automatically be moved and point to the next host page
2241 * we're going to send, so no need to update here.
2242 *
2243 * Normally QEMU never sends >1 host page in requests, so
2244 * logically we don't even need that as the loop should only
2245 * run once, but just to be consistent.
2246 */
2247 len -= page_size;
2248 };
2249 qemu_mutex_unlock(&rs->bitmap_mutex);
2250
2251 return ret;
2252 }
2253
ec481c6c 2254 struct RAMSrcPageRequest *new_entry =
b21e2380 2255 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2256 new_entry->rb = ramblock;
2257 new_entry->offset = start;
2258 new_entry->len = len;
2259
2260 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2261 qemu_mutex_lock(&rs->src_page_req_mutex);
2262 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2263 migration_make_urgent_request();
ec481c6c 2264 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2265
2266 return 0;
6c595cde
DDAG
2267}
2268
d7400a34
XG
2269static bool save_page_use_compression(RAMState *rs)
2270{
2271 if (!migrate_use_compression()) {
2272 return false;
2273 }
2274
2275 /*
1a373522
DH
2276 * If xbzrle is enabled (e.g., after first round of migration), stop
2277 * using the data compression. In theory, xbzrle can do better than
2278 * compression.
d7400a34 2279 */
1a373522
DH
2280 if (rs->xbzrle_enabled) {
2281 return false;
d7400a34
XG
2282 }
2283
1a373522 2284 return true;
d7400a34
XG
2285}
2286
5e5fdcff
XG
2287/*
2288 * try to compress the page before posting it out, return true if the page
2289 * has been properly handled by compression, otherwise needs other
2290 * paths to handle it
2291 */
ec6f3ab9
PX
2292static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2293 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2294{
2295 if (!save_page_use_compression(rs)) {
2296 return false;
2297 }
2298
2299 /*
2300 * When starting the process of a new block, the first page of
2301 * the block should be sent out before other pages in the same
2302 * block, and all the pages in the last block should have been sent
2303 * out; keeping this order is important because the 'cont' flag
2304 * is used to avoid resending the block name.
2305 *
2306 * We post the first page as a normal page, as compression would take
2307 * a lot of CPU resources.
2308 */
ec6f3ab9 2309 if (block != pss->last_sent_block) {
5e5fdcff
XG
2310 flush_compressed_data(rs);
2311 return false;
2312 }
2313
eaa238ab 2314 if (compress_page_with_multi_thread(block, offset) > 0) {
5e5fdcff
XG
2315 return true;
2316 }
2317
76e03000 2318 compression_counters.busy++;
5e5fdcff
XG
2319 return false;
2320}
2321
a82d593b 2322/**
4010ba38 2323 * ram_save_target_page_legacy: save one target page
a82d593b 2324 *
3d0684b2 2325 * Returns the number of pages written
a82d593b 2326 *
6f37bb8b 2327 * @rs: current RAM state
3d0684b2 2328 * @pss: data about the page we want to send
a82d593b 2329 */
4010ba38 2330static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2331{
a8ec91f9 2332 RAMBlock *block = pss->block;
8bba004c 2333 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2334 int res;
2335
61717ea9 2336 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2337 return res;
2338 }
2339
ec6f3ab9 2340 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2341 return 1;
d7400a34
XG
2342 }
2343
37502df3 2344 res = save_zero_page(pss, pss->pss_channel, block, offset);
d7400a34
XG
2345 if (res > 0) {
2346 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2347 * page would be stale
2348 */
ef5c3d13 2349 if (rs->xbzrle_enabled) {
d7400a34
XG
2350 XBZRLE_cache_lock();
2351 xbzrle_cache_zero_page(rs, block->offset + offset);
2352 XBZRLE_cache_unlock();
2353 }
d7400a34
XG
2354 return res;
2355 }
2356
da3f56cb 2357 /*
6f39c90b
PX
2358 * Do not use multifd in postcopy as one whole host page should be
2359 * placed. Meanwhile postcopy requires atomic update of pages, so even
2360 * if host page size == guest page size, the destination guest may
2361 * still see partially copied pages while running, which is data corruption.
da3f56cb 2362 */
6f39c90b 2363 if (migrate_use_multifd() && !migration_in_postcopy()) {
61717ea9 2364 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2365 }
2366
05931ec5 2367 return ram_save_page(rs, pss);
a82d593b
DDAG
2368}
2369
d9e474ea
PX
2370/* Should be called before sending a host page */
2371static void pss_host_page_prepare(PageSearchStatus *pss)
2372{
2373 /* How many guest pages are there in one host page? */
2374 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2375
2376 pss->host_page_sending = true;
301d7ffe
PX
2377 if (guest_pfns <= 1) {
2378 /*
2379 * This covers both the case where guest psize == host psize and the
2380 * case where the guest has a larger psize than the host (guest_pfns==0).
2381 *
2382 * For the latter, we always send one whole guest page per
2383 * iteration of the host page (example: an Alpha VM on x86 host
2384 * will have guest psize 8K while host psize 4K).
2385 */
2386 pss->host_page_start = pss->page;
2387 pss->host_page_end = pss->page + 1;
2388 } else {
2389 /*
2390 * The host page spans over multiple guest pages, we send them
2391 * within the same host page iteration.
2392 */
2393 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2394 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2395 }
d9e474ea
PX
2396}
2397
2398/*
2399 * Whether the page pointed by PSS is within the host page being sent.
2400 * Must be called after a previous pss_host_page_prepare().
2401 */
2402static bool pss_within_range(PageSearchStatus *pss)
2403{
2404 ram_addr_t ram_addr;
2405
2406 assert(pss->host_page_sending);
2407
2408 /* Over host-page boundary? */
2409 if (pss->page >= pss->host_page_end) {
2410 return false;
2411 }
2412
2413 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2414
2415 return offset_in_ramblock(pss->block, ram_addr);
2416}
2417
2418static void pss_host_page_finish(PageSearchStatus *pss)
2419{
2420 pss->host_page_sending = false;
2421 /* This is not needed, but just to reset it */
2422 pss->host_page_start = pss->host_page_end = 0;
2423}
2424
93589827
PX
2425/*
2426 * Send an urgent host page specified by `pss'. Need to be called with
2427 * bitmap_mutex held.
2428 *
2429 * Returns 0 if saving the host page succeeded, negative otherwise.
2430 */
2431static int ram_save_host_page_urgent(PageSearchStatus *pss)
2432{
2433 bool page_dirty, sent = false;
2434 RAMState *rs = ram_state;
2435 int ret = 0;
2436
2437 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2438 pss_host_page_prepare(pss);
2439
2440 /*
2441 * If precopy is sending the same page, let it be done in precopy, or
2442 * we could send the same page in two channels and none of them will
2443 * receive the whole page.
2444 */
2445 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2446 trace_postcopy_preempt_hit(pss->block->idstr,
2447 pss->page << TARGET_PAGE_BITS);
2448 return 0;
2449 }
2450
2451 do {
2452 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2453
2454 if (page_dirty) {
2455 /* Be strict about the return code; it must be 1 (exactly one page sent) */
4010ba38 2456 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2457 error_report_once("%s: ram_save_target_page failed", __func__);
2458 ret = -1;
2459 goto out;
2460 }
2461 sent = true;
2462 }
2463 pss_find_next_dirty(pss);
2464 } while (pss_within_range(pss));
2465out:
2466 pss_host_page_finish(pss);
2467 /* For urgent requests, flush immediately if sent */
2468 if (sent) {
2469 qemu_fflush(pss->pss_channel);
2470 }
2471 return ret;
2472}
2473
a82d593b 2474/**
3d0684b2 2475 * ram_save_host_page: save a whole host page
a82d593b 2476 *
3d0684b2
JQ
2477 * Starting at pss->page, send pages up to the end of the current host
2478 * page. It's valid for the initial page to point into the middle of
2479 * a host page, in which case the remainder of the host page is sent.
2480 * Only dirty target pages are sent. Note that the host page size may
2481 * be a huge page for this block.
f3321554 2482 *
1eb3fc0a
DDAG
2483 * The saving stops at the boundary of the used_length of the block
2484 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2485 *
f3321554
PX
2486 * The caller must hold ram_state.bitmap_mutex when calling this
2487 * function. Note that this function can temporarily release the lock, but
2488 * when it returns it will make sure the lock is held again.
2489 *
3d0684b2
JQ
2490 * Returns the number of pages written or negative on error
2491 *
6f37bb8b 2492 * @rs: current RAM state
3d0684b2 2493 * @pss: data about the page we want to send
a82d593b 2494 */
05931ec5 2495static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2496{
f3321554 2497 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2498 int tmppages, pages = 0;
a935e30f
JQ
2499 size_t pagesize_bits =
2500 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2501 unsigned long start_page = pss->page;
2502 int res;
4c011c37 2503
fbd162e6 2504 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2505 error_report("block %s should not be migrated!", pss->block->idstr);
2506 return 0;
2507 }
2508
d9e474ea
PX
2509 /* Update host page boundary information */
2510 pss_host_page_prepare(pss);
2511
a82d593b 2512 do {
f3321554 2513 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2514
f3321554
PX
2515 /* Check whether the page is dirty and, if it is, send it */
2516 if (page_dirty) {
ba1b7c81 2517 /*
f3321554
PX
2518 * Properly yield the lock only in postcopy preempt mode
2519 * because both migration thread and rp-return thread can
2520 * operate on the bitmaps.
ba1b7c81 2521 */
f3321554
PX
2522 if (preempt_active) {
2523 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2524 }
4010ba38 2525 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2526 if (tmppages >= 0) {
2527 pages += tmppages;
2528 /*
2529 * Allow rate limiting to happen in the middle of huge pages if
2530 * something is sent in the current iteration.
2531 */
2532 if (pagesize_bits > 1 && tmppages > 0) {
2533 migration_rate_limit();
2534 }
2535 }
2536 if (preempt_active) {
2537 qemu_mutex_lock(&rs->bitmap_mutex);
2538 }
2539 } else {
2540 tmppages = 0;
23feba90 2541 }
f3321554
PX
2542
2543 if (tmppages < 0) {
d9e474ea 2544 pss_host_page_finish(pss);
f3321554
PX
2545 return tmppages;
2546 }
2547
d9e474ea
PX
2548 pss_find_next_dirty(pss);
2549 } while (pss_within_range(pss));
2550
2551 pss_host_page_finish(pss);
278e2f55
AG
2552
2553 res = ram_save_release_protection(rs, pss, start_page);
2554 return (res < 0 ? res : pages);
a82d593b 2555}
6c595cde 2556
56e93d26 2557/**
3d0684b2 2558 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2559 *
2560 * Called within an RCU critical section.
2561 *
e8f3735f
XG
2562 * Returns the number of pages written where zero means no dirty pages,
2563 * or negative on error
56e93d26 2564 *
6f37bb8b 2565 * @rs: current RAM state
a82d593b
DDAG
2566 *
2567 * On systems where host-page-size > target-page-size it will send all the
2568 * pages in a host page that are dirty.
56e93d26 2569 */
05931ec5 2570static int ram_find_and_save_block(RAMState *rs)
56e93d26 2571{
f1668764 2572 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2573 int pages = 0;
56e93d26 2574
0827b9e9 2575 /* No dirty page as there is zero RAM */
8d80e195 2576 if (!rs->ram_bytes_total) {
0827b9e9
AA
2577 return pages;
2578 }
2579
4934a5dd
PX
2580 /*
2581 * Always keep last_seen_block/last_page valid during this procedure,
2582 * because find_dirty_block() relies on these values (e.g., we compare
2583 * last_seen_block with pss.block to see whether we searched all the
2584 * ramblocks) to detect the completion of migration. Having a NULL value
2585 * of last_seen_block can conditionally cause the loop below to run forever.
2586 */
2587 if (!rs->last_seen_block) {
2588 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2589 rs->last_page = 0;
2590 }
2591
f1668764 2592 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2593
31e2ac74 2594 while (true) {
51efd36f 2595 if (!get_queued_page(rs, pss)) {
b062106d 2596 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2597 int res = find_dirty_block(rs, pss);
2598 if (res != PAGE_DIRTY_FOUND) {
2599 if (res == PAGE_ALL_CLEAN) {
51efd36f 2600 break;
31e2ac74
JQ
2601 } else if (res == PAGE_TRY_AGAIN) {
2602 continue;
51efd36f
JQ
2603 }
2604 }
56e93d26 2605 }
51efd36f 2606 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2607 if (pages) {
2608 break;
2609 }
2610 }
56e93d26 2611
f1668764
PX
2612 rs->last_seen_block = pss->block;
2613 rs->last_page = pss->page;
56e93d26
JQ
2614
2615 return pages;
2616}
2617
2618void acct_update_position(QEMUFile *f, size_t size, bool zero)
2619{
2620 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2621
56e93d26 2622 if (zero) {
abce5fa1 2623 stat64_add(&ram_counters.duplicate, pages);
56e93d26 2624 } else {
abce5fa1 2625 stat64_add(&ram_counters.normal, pages);
4c2d0f6d 2626 ram_transferred_add(size);
1a93bd2f 2627 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2628 }
2629}
2630
8008a272 2631static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2632{
2633 RAMBlock *block;
2634 uint64_t total = 0;
2635
89ac5a1d
DDAG
2636 RCU_READ_LOCK_GUARD();
2637
8008a272
JQ
2638 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2639 total += block->used_length;
99e15582 2640 }
56e93d26
JQ
2641 return total;
2642}
2643
fbd162e6
YK
2644uint64_t ram_bytes_total(void)
2645{
8008a272
JQ
2646 RAMBlock *block;
2647 uint64_t total = 0;
2648
2649 RCU_READ_LOCK_GUARD();
2650
2651 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2652 total += block->used_length;
2653 }
2654 return total;
fbd162e6
YK
2655}
2656
f265e0e4 2657static void xbzrle_load_setup(void)
56e93d26 2658{
f265e0e4 2659 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2660}
2661
f265e0e4
JQ
2662static void xbzrle_load_cleanup(void)
2663{
2664 g_free(XBZRLE.decoded_buf);
2665 XBZRLE.decoded_buf = NULL;
2666}
2667
7d7c96be
PX
2668static void ram_state_cleanup(RAMState **rsp)
2669{
b9ccaf6d
DDAG
2670 if (*rsp) {
2671 migration_page_queue_free(*rsp);
2672 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2673 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2674 g_free(*rsp);
2675 *rsp = NULL;
2676 }
7d7c96be
PX
2677}
2678
84593a08
PX
2679static void xbzrle_cleanup(void)
2680{
2681 XBZRLE_cache_lock();
2682 if (XBZRLE.cache) {
2683 cache_fini(XBZRLE.cache);
2684 g_free(XBZRLE.encoded_buf);
2685 g_free(XBZRLE.current_buf);
2686 g_free(XBZRLE.zero_target_page);
2687 XBZRLE.cache = NULL;
2688 XBZRLE.encoded_buf = NULL;
2689 XBZRLE.current_buf = NULL;
2690 XBZRLE.zero_target_page = NULL;
2691 }
2692 XBZRLE_cache_unlock();
2693}
2694
f265e0e4 2695static void ram_save_cleanup(void *opaque)
56e93d26 2696{
53518d94 2697 RAMState **rsp = opaque;
6b6712ef 2698 RAMBlock *block;
eb859c53 2699
278e2f55
AG
2700 /* We don't use dirty log with background snapshots */
2701 if (!migrate_background_snapshot()) {
2702 /* caller have hold iothread lock or is in a bh, so there is
2703 * no writing race against the migration bitmap
2704 */
63b41db4
HH
2705 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2706 /*
2707 * do not stop dirty log without starting it, since
2708 * memory_global_dirty_log_stop will assert that
2709 * memory_global_dirty_log_start/stop used in pairs
2710 */
2711 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2712 }
278e2f55 2713 }
6b6712ef 2714
fbd162e6 2715 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2716 g_free(block->clear_bmap);
2717 block->clear_bmap = NULL;
6b6712ef
JQ
2718 g_free(block->bmap);
2719 block->bmap = NULL;
56e93d26
JQ
2720 }
2721
84593a08 2722 xbzrle_cleanup();
f0afa331 2723 compress_threads_save_cleanup();
7d7c96be 2724 ram_state_cleanup(rsp);
4010ba38
JQ
2725 g_free(migration_ops);
2726 migration_ops = NULL;
56e93d26
JQ
2727}
2728
6f37bb8b 2729static void ram_state_reset(RAMState *rs)
56e93d26 2730{
ec6f3ab9
PX
2731 int i;
2732
2733 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2734 rs->pss[i].last_sent_block = NULL;
2735 }
2736
6f37bb8b 2737 rs->last_seen_block = NULL;
269ace29 2738 rs->last_page = 0;
6f37bb8b 2739 rs->last_version = ram_list.version;
1a373522 2740 rs->xbzrle_enabled = false;
56e93d26
JQ
2741}
2742
2743#define MAX_WAIT 50 /* ms, half buffered_file limit */
2744
e0b266f0
DDAG
2745/* **** functions for postcopy ***** */
2746
ced1c616
PB
2747void ram_postcopy_migrated_memory_release(MigrationState *ms)
2748{
2749 struct RAMBlock *block;
ced1c616 2750
fbd162e6 2751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2752 unsigned long *bitmap = block->bmap;
2753 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2754 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2755
2756 while (run_start < range) {
2757 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2758 ram_discard_range(block->idstr,
2759 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2760 ((ram_addr_t)(run_end - run_start))
2761 << TARGET_PAGE_BITS);
ced1c616
PB
2762 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2763 }
2764 }
2765}
2766
3d0684b2
JQ
2767/**
2768 * postcopy_send_discard_bm_ram: discard a RAMBlock
2769 *
e0b266f0 2770 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2771 *
2772 * @ms: current migration state
89dab31b 2773 * @block: RAMBlock to discard
e0b266f0 2774 */
9e7d1223 2775static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2776{
6b6712ef 2777 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2778 unsigned long current;
1e7cf8c3 2779 unsigned long *bitmap = block->bmap;
e0b266f0 2780
6b6712ef 2781 for (current = 0; current < end; ) {
1e7cf8c3 2782 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2783 unsigned long zero, discard_length;
e0b266f0 2784
33a5cb62
WY
2785 if (one >= end) {
2786 break;
2787 }
e0b266f0 2788
1e7cf8c3 2789 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2790
2791 if (zero >= end) {
2792 discard_length = end - one;
e0b266f0 2793 } else {
33a5cb62
WY
2794 discard_length = zero - one;
2795 }
810cf2bb 2796 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2797 current = one + discard_length;
e0b266f0 2798 }
e0b266f0
DDAG
2799}
2800
f30c2e5b
PX
2801static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2802
3d0684b2
JQ
2803/**
2804 * postcopy_each_ram_send_discard: discard all RAMBlocks
2805 *
e0b266f0
DDAG
2806 * Utility for the outgoing postcopy code.
2807 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2808 * passing it bitmap indexes and name.
e0b266f0
DDAG
2809 * (qemu_ram_foreach_block ends up passing unscaled lengths
2810 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2811 *
2812 * @ms: current migration state
e0b266f0 2813 */
739fcc1b 2814static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2815{
2816 struct RAMBlock *block;
e0b266f0 2817
fbd162e6 2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2819 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2820
f30c2e5b
PX
2821 /*
2822 * Deal with TPS != HPS and huge pages. It discards any partially sent
2823 * host-page size chunks and marks any partially dirty host-page size
2824 * chunks as all dirty. In this case the host-page is the host-page
2825 * for the particular RAMBlock, i.e. it might be a huge page.
2826 */
2827 postcopy_chunk_hostpages_pass(ms, block);
2828
e0b266f0
DDAG
2829 /*
2830 * Postcopy sends chunks of bitmap over the wire, but it
2831 * just needs indexes at this point, avoids it having
2832 * target page specific code.
2833 */
739fcc1b 2834 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2835 postcopy_discard_send_finish(ms);
e0b266f0 2836 }
e0b266f0
DDAG
2837}
2838
3d0684b2 2839/**
8324ef86 2840 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2841 *
2842 * Helper for postcopy_each_ram_send_discard; called once per
2843 * RAMBlock to canonicalize its dirty bitmap.
99e314eb 2845 *
3d0684b2
JQ
2846 * Postcopy requires that all target pages in a hostpage are dirty or
2847 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2848 *
3d0684b2 2849 * @ms: current migration state
3d0684b2 2850 * @block: block that contains the page we want to canonicalize
99e314eb 2851 */
1e7cf8c3 2852static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2853{
53518d94 2854 RAMState *rs = ram_state;
6b6712ef 2855 unsigned long *bitmap = block->bmap;
29c59172 2856 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2857 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2858 unsigned long run_start;
2859
29c59172
DDAG
2860 if (block->page_size == TARGET_PAGE_SIZE) {
2861 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2862 return;
2863 }
2864
1e7cf8c3
WY
2865 /* Find a dirty page */
2866 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2867
6b6712ef 2868 while (run_start < pages) {
99e314eb
DDAG
2869
2870 /*
2871 * If the start of this run of pages is in the middle of a host
2872 * page, then we need to fixup this host page.
2873 */
9dec3cc3 2874 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2875 /* Find the end of this run */
1e7cf8c3 2876 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2877 /*
2878 * If the end isn't at the start of a host page, then the
2879 * run doesn't finish at the end of a host page
2880 * and we need to discard.
2881 */
99e314eb
DDAG
2882 }
2883
9dec3cc3 2884 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2885 unsigned long page;
dad45ab2
WY
2886 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2887 host_ratio);
2888 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2889
99e314eb
DDAG
2890 /* Clean up the bitmap */
2891 for (page = fixup_start_addr;
2892 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2893 /*
2894 * Remark them as dirty, updating the count for any pages
2895 * that weren't previously dirty.
2896 */
0d8ec885 2897 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2898 }
2899 }
2900
1e7cf8c3
WY
2901 /* Find the next dirty page for the next iteration */
2902 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2903 }
2904}
2905
3d0684b2
JQ
2906/**
2907 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2908 *
e0b266f0
DDAG
2909 * Transmit the set of pages to be discarded after precopy to the target;
2910 * these are pages that:
2911 * a) Have been previously transmitted but are now dirty again
2912 * b) Have never been transmitted; this ensures that
2913 * any pages on the destination that have been mapped by background
2914 * tasks get discarded (transparent huge pages are the specific concern).
2915 * Hopefully this is pretty sparse.
3d0684b2
JQ
2916 *
2917 * @ms: current migration state
e0b266f0 2918 */
739fcc1b 2919void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2920{
53518d94 2921 RAMState *rs = ram_state;
e0b266f0 2922
89ac5a1d 2923 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2924
2925 /* This should be our last sync, the src is now paused */
eb859c53 2926 migration_bitmap_sync(rs);
e0b266f0 2927
6b6712ef 2928 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2929 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2930 rs->last_seen_block = NULL;
6b6712ef 2931 rs->last_page = 0;
e0b266f0 2932
739fcc1b 2933 postcopy_each_ram_send_discard(ms);
e0b266f0 2934
739fcc1b 2935 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2936}
2937
3d0684b2
JQ
2938/**
2939 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2940 *
3d0684b2 2941 * Returns zero on success
e0b266f0 2942 *
36449157
JQ
2943 * @rbname: name of the RAMBlock of the request. NULL means the
2944 * same as the last one.
3d0684b2
JQ
2945 * @start: RAMBlock starting page
2946 * @length: RAMBlock size
e0b266f0 2947 */
aaa2064c 2948int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2949{
36449157 2950 trace_ram_discard_range(rbname, start, length);
d3a5038c 2951
89ac5a1d 2952 RCU_READ_LOCK_GUARD();
36449157 2953 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2954
2955 if (!rb) {
36449157 2956 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2957 return -1;
e0b266f0
DDAG
2958 }
2959
814bb08f
PX
2960 /*
2961 * On source VM, we don't need to update the received bitmap since
2962 * we don't even have one.
2963 */
2964 if (rb->receivedmap) {
2965 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2966 length >> qemu_target_page_bits());
2967 }
2968
03acb4e9 2969 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2970}
2971
84593a08
PX
2972/*
2973 * For every allocation, we will try not to crash the VM if the
2974 * allocation failed.
2975 */
2976static int xbzrle_init(void)
2977{
2978 Error *local_err = NULL;
2979
2980 if (!migrate_use_xbzrle()) {
2981 return 0;
2982 }
2983
2984 XBZRLE_cache_lock();
2985
2986 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2987 if (!XBZRLE.zero_target_page) {
2988 error_report("%s: Error allocating zero page", __func__);
2989 goto err_out;
2990 }
2991
2992 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2993 TARGET_PAGE_SIZE, &local_err);
2994 if (!XBZRLE.cache) {
2995 error_report_err(local_err);
2996 goto free_zero_page;
2997 }
2998
2999 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3000 if (!XBZRLE.encoded_buf) {
3001 error_report("%s: Error allocating encoded_buf", __func__);
3002 goto free_cache;
3003 }
3004
3005 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3006 if (!XBZRLE.current_buf) {
3007 error_report("%s: Error allocating current_buf", __func__);
3008 goto free_encoded_buf;
3009 }
3010
3011 /* We are all good */
3012 XBZRLE_cache_unlock();
3013 return 0;
3014
3015free_encoded_buf:
3016 g_free(XBZRLE.encoded_buf);
3017 XBZRLE.encoded_buf = NULL;
3018free_cache:
3019 cache_fini(XBZRLE.cache);
3020 XBZRLE.cache = NULL;
3021free_zero_page:
3022 g_free(XBZRLE.zero_target_page);
3023 XBZRLE.zero_target_page = NULL;
3024err_out:
3025 XBZRLE_cache_unlock();
3026 return -ENOMEM;
3027}
3028
53518d94 3029static int ram_state_init(RAMState **rsp)
56e93d26 3030{
7d00ee6a
PX
3031 *rsp = g_try_new0(RAMState, 1);
3032
3033 if (!*rsp) {
3034 error_report("%s: Init ramstate fail", __func__);
3035 return -1;
3036 }
53518d94
JQ
3037
3038 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3039 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3040 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 3041 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 3042
7d00ee6a 3043 /*
40c4d4a8
IR
3044 * Count the total number of pages used by ram blocks not including any
3045 * gaps due to alignment or unplugs.
03158519 3046 * This must match with the initial values of dirty bitmap.
7d00ee6a 3047 */
8d80e195 3048 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
3049 ram_state_reset(*rsp);
3050
3051 return 0;
3052}
3053
d6eff5d7 3054static void ram_list_init_bitmaps(void)
7d00ee6a 3055{
002cad6b 3056 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3057 RAMBlock *block;
3058 unsigned long pages;
002cad6b 3059 uint8_t shift;
56e93d26 3060
0827b9e9
AA
3061 /* Skip setting bitmap if there is no RAM */
3062 if (ram_bytes_total()) {
002cad6b
PX
3063 shift = ms->clear_bitmap_shift;
3064 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3065 error_report("clear_bitmap_shift (%u) too big, using "
3066 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3067 shift = CLEAR_BITMAP_SHIFT_MAX;
3068 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3069 error_report("clear_bitmap_shift (%u) too small, using "
3070 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3071 shift = CLEAR_BITMAP_SHIFT_MIN;
3072 }
3073
fbd162e6 3074 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3075 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3076 /*
3077 * The initial dirty bitmap for migration must be set with all
3078 * ones to make sure we'll migrate every guest RAM page to
3079 * destination.
40c4d4a8
IR
3080 * Here we set RAMBlock.bmap all to 1 because when restarting a
3081 * new migration after a failed migration, ram_list.
3082 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3083 * guest memory.
03158519 3084 */
6b6712ef 3085 block->bmap = bitmap_new(pages);
40c4d4a8 3086 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3087 block->clear_bmap_shift = shift;
3088 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3089 }
f3f491fc 3090 }
d6eff5d7
PX
3091}
3092
be39b4cd
DH
3093static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3094{
3095 unsigned long pages;
3096 RAMBlock *rb;
3097
3098 RCU_READ_LOCK_GUARD();
3099
3100 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3101 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3102 rs->migration_dirty_pages -= pages;
3103 }
3104}
3105
d6eff5d7
PX
3106static void ram_init_bitmaps(RAMState *rs)
3107{
3108 /* For memory_global_dirty_log_start below. */
3109 qemu_mutex_lock_iothread();
3110 qemu_mutex_lock_ramlist();
f3f491fc 3111
89ac5a1d
DDAG
3112 WITH_RCU_READ_LOCK_GUARD() {
3113 ram_list_init_bitmaps();
278e2f55
AG
3114 /* We don't use dirty log with background snapshots */
3115 if (!migrate_background_snapshot()) {
63b41db4 3116 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3117 migration_bitmap_sync_precopy(rs);
3118 }
89ac5a1d 3119 }
56e93d26 3120 qemu_mutex_unlock_ramlist();
49877834 3121 qemu_mutex_unlock_iothread();
be39b4cd
DH
3122
3123 /*
3124 * After an eventual first bitmap sync, fixup the initial bitmap
3125 * containing all 1s to exclude any discarded pages from migration.
3126 */
3127 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3128}
3129
3130static int ram_init_all(RAMState **rsp)
3131{
3132 if (ram_state_init(rsp)) {
3133 return -1;
3134 }
3135
3136 if (xbzrle_init()) {
3137 ram_state_cleanup(rsp);
3138 return -1;
3139 }
3140
3141 ram_init_bitmaps(*rsp);
a91246c9
HZ
3142
3143 return 0;
3144}
3145
08614f34
PX
3146static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3147{
3148 RAMBlock *block;
3149 uint64_t pages = 0;
3150
3151 /*
3152 * Postcopy is not using xbzrle/compression, so no need for that.
3153 * Also, since source are already halted, we don't need to care
3154 * about dirty page logging as well.
3155 */
3156
fbd162e6 3157 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3158 pages += bitmap_count_one(block->bmap,
3159 block->used_length >> TARGET_PAGE_BITS);
3160 }
3161
3162 /* This may not be aligned with current bitmaps. Recalculate. */
3163 rs->migration_dirty_pages = pages;
3164
1a373522 3165 ram_state_reset(rs);
08614f34
PX
3166
3167 /* Update RAMState cache of output QEMUFile */
7f401b80 3168 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3169
3170 trace_ram_state_resume_prepare(pages);
3171}
3172
6bcb05fc
WW
3173/*
3174 * This function clears bits of the free pages reported by the caller from the
3175 * migration dirty bitmap. @addr is the host address corresponding to the
3176 * start of the contiguous guest free pages, and @len is the total bytes of
3177 * those pages.
3178 */
3179void qemu_guest_free_page_hint(void *addr, size_t len)
3180{
3181 RAMBlock *block;
3182 ram_addr_t offset;
3183 size_t used_len, start, npages;
3184 MigrationState *s = migrate_get_current();
3185
3186 /* This function is currently expected to be used during live migration */
3187 if (!migration_is_setup_or_active(s->state)) {
3188 return;
3189 }
3190
3191 for (; len > 0; len -= used_len, addr += used_len) {
3192 block = qemu_ram_block_from_host(addr, false, &offset);
3193 if (unlikely(!block || offset >= block->used_length)) {
3194 /*
3195 * The implementation might not support RAMBlock resize during
3196 * live migration, but it could happen in theory with future
3197 * updates. So we add a check here to capture that case.
3198 */
3199 error_report_once("%s unexpected error", __func__);
3200 return;
3201 }
3202
3203 if (len <= block->used_length - offset) {
3204 used_len = len;
3205 } else {
3206 used_len = block->used_length - offset;
3207 }
3208
3209 start = offset >> TARGET_PAGE_BITS;
3210 npages = used_len >> TARGET_PAGE_BITS;
3211
3212 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3213 /*
3214 * From clear_bmap's perspective, the skipped free pages are equivalent
3215 * to having been sent, so clear the bits from the memory region bitmap which
3216 * are initially set. Otherwise those skipped pages will be sent in
3217 * the next round after syncing from the memory region bitmap.
3218 */
1230a25f 3219 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3220 ram_state->migration_dirty_pages -=
3221 bitmap_count_one_with_offset(block->bmap, start, npages);
3222 bitmap_clear(block->bmap, start, npages);
3223 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3224 }
3225}
3226
3d0684b2
JQ
3227/*
3228 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3229 * a long-running RCU critical section. When RCU reclaims in the code
3230 * start to become numerous it will be necessary to reduce the
3231 * granularity of these critical sections.
3232 */
3233
3d0684b2
JQ
3234/**
3235 * ram_save_setup: Setup RAM for migration
3236 *
3237 * Returns zero to indicate success and negative for error
3238 *
3239 * @f: QEMUFile where to send the data
3240 * @opaque: RAMState pointer
3241 */
a91246c9
HZ
3242static int ram_save_setup(QEMUFile *f, void *opaque)
3243{
53518d94 3244 RAMState **rsp = opaque;
a91246c9 3245 RAMBlock *block;
33d70973 3246 int ret;
a91246c9 3247
dcaf446e
XG
3248 if (compress_threads_save_setup()) {
3249 return -1;
3250 }
3251
a91246c9
HZ
3252 /* migration has already setup the bitmap, reuse it. */
3253 if (!migration_in_colo_state()) {
7d00ee6a 3254 if (ram_init_all(rsp) != 0) {
dcaf446e 3255 compress_threads_save_cleanup();
a91246c9 3256 return -1;
53518d94 3257 }
a91246c9 3258 }
7f401b80 3259 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3260
0e6ebd48 3261 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3262 qemu_put_be64(f, ram_bytes_total_with_ignored()
3263 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3264
0e6ebd48
DDAG
3265 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3266 qemu_put_byte(f, strlen(block->idstr));
3267 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3268 qemu_put_be64(f, block->used_length);
3269 if (migrate_postcopy_ram() && block->page_size !=
3270 qemu_host_page_size) {
3271 qemu_put_be64(f, block->page_size);
3272 }
3273 if (migrate_ignore_shared()) {
3274 qemu_put_be64(f, block->mr->addr);
3275 }
fbd162e6 3276 }
56e93d26
JQ
3277 }
3278
56e93d26
JQ
3279 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3280 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3281
4010ba38
JQ
3282 migration_ops = g_malloc0(sizeof(MigrationOps));
3283 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
8ebb6ecc 3284 ret = multifd_send_sync_main(f);
33d70973
LB
3285 if (ret < 0) {
3286 return ret;
3287 }
3288
56e93d26 3289 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3290 qemu_fflush(f);
56e93d26
JQ
3291
3292 return 0;
3293}
3294
3d0684b2
JQ
3295/**
3296 * ram_save_iterate: iterative stage for migration
3297 *
3298 * Returns zero to indicate success and negative for error
3299 *
3300 * @f: QEMUFile where to send the data
3301 * @opaque: RAMState pointer
3302 */
56e93d26
JQ
3303static int ram_save_iterate(QEMUFile *f, void *opaque)
3304{
53518d94
JQ
3305 RAMState **temp = opaque;
3306 RAMState *rs = *temp;
3d4095b2 3307 int ret = 0;
56e93d26
JQ
3308 int i;
3309 int64_t t0;
5c90308f 3310 int done = 0;
56e93d26 3311
b2557345
PL
3312 if (blk_mig_bulk_active()) {
3313 /* Avoid transferring ram during bulk phase of block migration as
3314 * the bulk phase will usually take a long time and transferring
3315 * ram updates during that time is pointless. */
3316 goto out;
3317 }
3318
63268c49
PX
3319 /*
3320 * We'll hold this lock for a little while, but that's okay for two reasons.
3321 * Firstly, the only other thread that may take it is the one calling
3322 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3323 * MAX_WAIT (if curious, see also commit 4508bd9ed8053ce) below, which
3324 * guarantees that we'll release it on a regular basis.
3325 */
3326 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3327 WITH_RCU_READ_LOCK_GUARD() {
3328 if (ram_list.version != rs->last_version) {
3329 ram_state_reset(rs);
3330 }
56e93d26 3331
89ac5a1d
DDAG
3332 /* Read version before ram_list.blocks */
3333 smp_rmb();
56e93d26 3334
89ac5a1d 3335 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3336
89ac5a1d
DDAG
3337 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3338 i = 0;
3339 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3340 postcopy_has_request(rs)) {
89ac5a1d 3341 int pages;
e03a34f8 3342
89ac5a1d
DDAG
3343 if (qemu_file_get_error(f)) {
3344 break;
3345 }
e8f3735f 3346
05931ec5 3347 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3348 /* no more pages to send */
3349 if (pages == 0) {
3350 done = 1;
3351 break;
3352 }
e8f3735f 3353
89ac5a1d
DDAG
3354 if (pages < 0) {
3355 qemu_file_set_error(f, pages);
56e93d26
JQ
3356 break;
3357 }
89ac5a1d
DDAG
3358
3359 rs->target_page_count += pages;
3360
644acf99
WY
3361 /*
3362 * During postcopy, it is necessary to make sure one whole host
3363 * page is sent in one chunk.
3364 */
3365 if (migrate_postcopy_ram()) {
3366 flush_compressed_data(rs);
3367 }
3368
89ac5a1d
DDAG
3369 /*
3370 * We want to check in the 1st loop, just in case it was the 1st
3371 * time and we had to sync the dirty bitmap.
3372 * qemu_clock_get_ns() is a bit expensive, so we only check once
3373 * every few iterations.
3374 */
3375 if ((i & 63) == 0) {
3376 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3377 1000000;
3378 if (t1 > MAX_WAIT) {
3379 trace_ram_save_iterate_big_wait(t1, i);
3380 break;
3381 }
3382 }
3383 i++;
56e93d26 3384 }
56e93d26 3385 }
63268c49 3386 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3387
3388 /*
3389 * Must occur before EOS (or any QEMUFile operation)
3390 * because of RDMA protocol.
3391 */
3392 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3393
b2557345 3394out:
b69a0227
JQ
3395 if (ret >= 0
3396 && migration_is_setup_or_active(migrate_get_current()->state)) {
7f401b80 3397 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3398 if (ret < 0) {
3399 return ret;
3400 }
3401
3d4095b2
JQ
3402 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3403 qemu_fflush(f);
4c2d0f6d 3404 ram_transferred_add(8);
56e93d26 3405
3d4095b2
JQ
3406 ret = qemu_file_get_error(f);
3407 }
56e93d26
JQ
3408 if (ret < 0) {
3409 return ret;
3410 }
3411
5c90308f 3412 return done;
56e93d26
JQ
3413}
3414
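/*
 * Illustrative sketch (not QEMU code) of the pacing pattern used inside
 * ram_save_iterate() above: do cheap work in a tight loop, consult the
 * comparatively expensive clock only once every 64 iterations, and leave
 * the loop once the time budget for this round is spent.  do_one_unit and
 * run_bounded are hypothetical stand-ins for "send one page" and the loop.
 */
#include <stdint.h>
#include <stdbool.h>
#include <time.h>

static uint64_t now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000u + ts.tv_nsec / 1000000u;
}

static bool run_bounded(bool (*do_one_unit)(void *), void *opaque,
                        uint64_t max_wait_ms)
{
    uint64_t t0 = now_ms();
    bool done = false;

    for (unsigned i = 0; !done; i++) {
        done = !do_one_unit(opaque);          /* false: nothing left to do   */

        if ((i & 63) == 0 && now_ms() - t0 > max_wait_ms) {
            break;                            /* budget exhausted this round */
        }
    }
    return done;                              /* true: all work finished     */
}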
3d0684b2
JQ
3415/**
3416 * ram_save_complete: function called to send the remaining amount of ram
3417 *
e8f3735f 3418 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3419 *
3420 * Called with iothread lock
3421 *
3422 * @f: QEMUFile where to send the data
3423 * @opaque: RAMState pointer
3424 */
56e93d26
JQ
3425static int ram_save_complete(QEMUFile *f, void *opaque)
3426{
53518d94
JQ
3427 RAMState **temp = opaque;
3428 RAMState *rs = *temp;
e8f3735f 3429 int ret = 0;
6f37bb8b 3430
05931ec5
JQ
3431 rs->last_stage = !migration_in_colo_state();
3432
89ac5a1d
DDAG
3433 WITH_RCU_READ_LOCK_GUARD() {
3434 if (!migration_in_postcopy()) {
3435 migration_bitmap_sync_precopy(rs);
3436 }
56e93d26 3437
89ac5a1d 3438 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3439
89ac5a1d 3440 /* try transferring iterative blocks of memory */
56e93d26 3441
89ac5a1d 3442 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3443 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3444 while (true) {
3445 int pages;
56e93d26 3446
05931ec5 3447 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3448 /* no more blocks to send */
3449 if (pages == 0) {
3450 break;
3451 }
3452 if (pages < 0) {
3453 ret = pages;
3454 break;
3455 }
e8f3735f 3456 }
c13221b5 3457 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3458
89ac5a1d
DDAG
3459 flush_compressed_data(rs);
3460 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3461 }
d09a6fde 3462
33d70973
LB
3463 if (ret < 0) {
3464 return ret;
3d4095b2 3465 }
56e93d26 3466
7f401b80 3467 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3468 if (ret < 0) {
3469 return ret;
3470 }
3471
3472 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3473 qemu_fflush(f);
3474
3475 return 0;
56e93d26
JQ
3476}
3477
24beea4e
JQ
3478static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3479 uint64_t *can_postcopy)
56e93d26 3480{
53518d94
JQ
3481 RAMState **temp = opaque;
3482 RAMState *rs = *temp;
56e93d26 3483
c8df4a7a 3484 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3485
c8df4a7a
JQ
3486 if (migrate_postcopy_ram()) {
3487 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3488 *can_postcopy += remaining_size;
c8df4a7a 3489 } else {
24beea4e 3490 *must_precopy += remaining_size;
c8df4a7a
JQ
3491 }
3492}
3493
24beea4e
JQ
3494static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3495 uint64_t *can_postcopy)
c8df4a7a 3496{
28ef5339 3497 MigrationState *s = migrate_get_current();
c8df4a7a
JQ
3498 RAMState **temp = opaque;
3499 RAMState *rs = *temp;
3500
3501 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3502
28ef5339 3503 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
56e93d26 3504 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3505 WITH_RCU_READ_LOCK_GUARD() {
3506 migration_bitmap_sync_precopy(rs);
3507 }
56e93d26 3508 qemu_mutex_unlock_iothread();
9edabd4d 3509 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3510 }
c31b098f 3511
86e1167e
VSO
3512 if (migrate_postcopy_ram()) {
3513 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3514 *can_postcopy += remaining_size;
86e1167e 3515 } else {
24beea4e 3516 *must_precopy += remaining_size;
86e1167e 3517 }
56e93d26
JQ
3518}
3519
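/*
 * Illustrative sketch (not QEMU code) of the decision rule used by
 * ram_state_pending_exact() above: the cheap estimate is simply
 * dirty_pages * page_size (e.g. 262144 dirty 4 KiB pages ~= 1 GiB still to
 * send), and only when that estimate drops below the convergence threshold,
 * and we are not already in postcopy, is the expensive bitmap sync worth
 * doing to refine the number.  worth_syncing_bitmap is a hypothetical name.
 */
#include <stdint.h>
#include <stdbool.h>

static bool worth_syncing_bitmap(uint64_t dirty_pages, uint64_t page_size,
                                 uint64_t threshold_bytes, bool in_postcopy)
{
    uint64_t remaining = dirty_pages * page_size;

    return !in_postcopy && remaining < threshold_bytes;
}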
3520static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3521{
3522 unsigned int xh_len;
3523 int xh_flags;
063e760a 3524 uint8_t *loaded_data;
56e93d26 3525
56e93d26
JQ
3526 /* extract RLE header */
3527 xh_flags = qemu_get_byte(f);
3528 xh_len = qemu_get_be16(f);
3529
3530 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3531 error_report("Failed to load XBZRLE page - wrong compression!");
3532 return -1;
3533 }
3534
3535 if (xh_len > TARGET_PAGE_SIZE) {
3536 error_report("Failed to load XBZRLE page - len overflow!");
3537 return -1;
3538 }
f265e0e4 3539 loaded_data = XBZRLE.decoded_buf;
56e93d26 3540 /* load data and decode */
f265e0e4 3541 /* it can change loaded_data to point to an internal buffer */
063e760a 3542 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3543
3544 /* decode RLE */
063e760a 3545 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3546 TARGET_PAGE_SIZE) == -1) {
3547 error_report("Failed to load XBZRLE page - decode error!");
3548 return -1;
3549 }
3550
3551 return 0;
3552}
3553
3d0684b2
JQ
3554/**
3555 * ram_block_from_stream: read a RAMBlock id from the migration stream
3556 *
3557 * Must be called from within a rcu critical section.
3558 *
56e93d26 3559 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3560 *
755e8d7c 3561 * @mis: the migration incoming state pointer
3d0684b2
JQ
3562 * @f: QEMUFile where to read the data from
3563 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3564 * @channel: the channel we're using
a7180877 3565 */
755e8d7c 3566static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3567 QEMUFile *f, int flags,
3568 int channel)
56e93d26 3569{
c01b16ed 3570 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3571 char id[256];
3572 uint8_t len;
3573
3574 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3575 if (!block) {
56e93d26
JQ
3576 error_report("Ack, bad migration stream!");
3577 return NULL;
3578 }
4c4bad48 3579 return block;
56e93d26
JQ
3580 }
3581
3582 len = qemu_get_byte(f);
3583 qemu_get_buffer(f, (uint8_t *)id, len);
3584 id[len] = 0;
3585
e3dd7493 3586 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3587 if (!block) {
3588 error_report("Can't find block %s", id);
3589 return NULL;
56e93d26
JQ
3590 }
3591
fbd162e6 3592 if (ramblock_is_ignored(block)) {
b895de50
CLG
3593 error_report("block %s should not be migrated !", id);
3594 return NULL;
3595 }
3596
c01b16ed 3597 mis->last_recv_block[channel] = block;
755e8d7c 3598
4c4bad48
HZ
3599 return block;
3600}
3601
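/*
 * Illustrative sketch (not QEMU code) of the RAM_SAVE_FLAG_CONTINUE scheme
 * that ram_block_from_stream() above implements on the receive side: the
 * sender sets the flag whenever a page belongs to the same RAMBlock as the
 * previous page on that channel, so the (length, id-string) pair only
 * travels when the block changes, and the receiver replays its per-channel
 * "last block" cache.  EX_CONTINUE, struct ex_block, resolve_block and
 * lookup_block are hypothetical stand-ins.
 */
#include <stddef.h>

#define EX_CONTINUE 0x20                /* mirrors RAM_SAVE_FLAG_CONTINUE */
#define EX_CHANNELS 2

struct ex_block {
    char id[256];
};

static struct ex_block *last_block[EX_CHANNELS];

static struct ex_block *
resolve_block(int flags, const char *id, int channel,
              struct ex_block *(*lookup_block)(const char *))
{
    if (flags & EX_CONTINUE) {
        return last_block[channel];     /* NULL here means a broken stream */
    }

    struct ex_block *b = lookup_block(id);
    if (b) {
        last_block[channel] = b;        /* remember for CONTINUE pages */
    }
    return b;
}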
3602static inline void *host_from_ram_block_offset(RAMBlock *block,
3603 ram_addr_t offset)
3604{
3605 if (!offset_in_ramblock(block, offset)) {
3606 return NULL;
3607 }
3608
3609 return block->host + offset;
56e93d26
JQ
3610}
3611
6a23f639
DH
3612static void *host_page_from_ram_block_offset(RAMBlock *block,
3613 ram_addr_t offset)
3614{
3615 /* Note: Explicitly no check against offset_in_ramblock(). */
3616 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3617 block->page_size);
3618}
3619
3620static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3621 ram_addr_t offset)
3622{
3623 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3624}
3625
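/*
 * Illustrative sketch (not QEMU code) of the alignment arithmetic used by
 * the two helpers above, assuming (as QEMU does) a power-of-two host page
 * size: aligning down masks off the low bits, and the in-page offset keeps
 * only those bits.  E.g. with 2 MiB huge pages, address 0x200a000 splits
 * into host page 0x2000000 plus offset 0xa000.
 */
#include <stdint.h>

static uintptr_t align_down(uintptr_t addr, uintptr_t page_size)
{
    return addr & ~(page_size - 1);
}

static uintptr_t offset_in_page(uintptr_t addr, uintptr_t page_size)
{
    return addr & (page_size - 1);
}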
13af18f2 3626static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3627 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3628{
3629 if (!offset_in_ramblock(block, offset)) {
3630 return NULL;
3631 }
3632 if (!block->colo_cache) {
3633 error_report("%s: colo_cache is NULL in block :%s",
3634 __func__, block->idstr);
3635 return NULL;
3636 }
7d9acafa
ZC
3637
3638 /*
3639 * During a colo checkpoint, we need a bitmap of these migrated pages.
3640 * It helps us decide which pages in the ram cache should be flushed
3641 * into VM's RAM later.
3642 */
8af66371
HZ
3643 if (record_bitmap &&
3644 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3645 ram_state->migration_dirty_pages++;
3646 }
13af18f2
ZC
3647 return block->colo_cache + offset;
3648}
3649
3d0684b2
JQ
3650/**
3651 * ram_handle_compressed: handle the zero page case
3652 *
56e93d26
JQ
3653 * If a page (or a whole RDMA chunk) has been
3654 * determined to be zero, then zap it.
3d0684b2
JQ
3655 *
3656 * @host: host address for the zero page
3657 * @ch: what the page is filled from. We only support zero
3658 * @size: size of the zero page
56e93d26
JQ
3659 */
3660void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3661{
bad452a7 3662 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3663 memset(host, ch, size);
3664 }
3665}
3666
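/*
 * Illustrative sketch (not QEMU code): why ram_handle_compressed() above
 * looks at the destination before writing.  Filling a page that already
 * reads back as zero would needlessly dirty it (and, for anonymous memory,
 * force it to be allocated), so the memset is skipped in that case.  A
 * naive stand-in for buffer_is_zero() might look like this:
 */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

static bool is_zero(const uint8_t *buf, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (buf[i]) {
            return false;
        }
    }
    return true;
}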
797ca154
XG
3667/* return the size after decompression, or negative value on error */
3668static int
3669qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3670 const uint8_t *source, size_t source_len)
3671{
3672 int err;
3673
3674 err = inflateReset(stream);
3675 if (err != Z_OK) {
3676 return -1;
3677 }
3678
3679 stream->avail_in = source_len;
3680 stream->next_in = (uint8_t *)source;
3681 stream->avail_out = dest_len;
3682 stream->next_out = dest;
3683
3684 err = inflate(stream, Z_NO_FLUSH);
3685 if (err != Z_STREAM_END) {
3686 return -1;
3687 }
3688
3689 return stream->total_out;
3690}
3691
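/*
 * Illustrative sketch (not QEMU code) of the z_stream lifecycle the
 * decompress threads rely on: inflateInit() once per thread (see
 * compress_threads_load_setup() below), then for every page an
 * inflateReset() plus a one-shot inflate() just like qemu_uncompress_data()
 * above, and finally inflateEnd() on cleanup.  decompress_one and run_once
 * are hypothetical names.
 */
#include <zlib.h>
#include <stdint.h>
#include <string.h>

static int decompress_one(z_stream *s, uint8_t *dst, size_t dst_len,
                          const uint8_t *src, size_t src_len)
{
    if (inflateReset(s) != Z_OK) {
        return -1;
    }
    s->next_in = (uint8_t *)src;
    s->avail_in = src_len;
    s->next_out = dst;
    s->avail_out = dst_len;

    /* A whole page must decompress in one call, hence Z_STREAM_END. */
    return inflate(s, Z_NO_FLUSH) == Z_STREAM_END ? (int)s->total_out : -1;
}

static int run_once(const uint8_t *comp, size_t comp_len,
                    uint8_t *page, size_t page_size)
{
    z_stream s;
    int ret;

    memset(&s, 0, sizeof(s));
    if (inflateInit(&s) != Z_OK) {      /* once per worker thread */
        return -1;
    }
    ret = decompress_one(&s, page, page_size, comp, comp_len);
    inflateEnd(&s);                     /* once on thread cleanup */
    return ret;
}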
56e93d26
JQ
3692static void *do_data_decompress(void *opaque)
3693{
3694 DecompressParam *param = opaque;
3695 unsigned long pagesize;
33d151f4 3696 uint8_t *des;
34ab9e97 3697 int len, ret;
56e93d26 3698
33d151f4 3699 qemu_mutex_lock(&param->mutex);
90e56fb4 3700 while (!param->quit) {
33d151f4
LL
3701 if (param->des) {
3702 des = param->des;
3703 len = param->len;
3704 param->des = 0;
3705 qemu_mutex_unlock(&param->mutex);
3706
56e93d26 3707 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3708
3709 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3710 param->compbuf, len);
f548222c 3711 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3712 error_report("decompress data failed");
3713 qemu_file_set_error(decomp_file, ret);
3714 }
73a8912b 3715
33d151f4
LL
3716 qemu_mutex_lock(&decomp_done_lock);
3717 param->done = true;
3718 qemu_cond_signal(&decomp_done_cond);
3719 qemu_mutex_unlock(&decomp_done_lock);
3720
3721 qemu_mutex_lock(&param->mutex);
3722 } else {
3723 qemu_cond_wait(&param->cond, &param->mutex);
3724 }
56e93d26 3725 }
33d151f4 3726 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3727
3728 return NULL;
3729}
3730
34ab9e97 3731static int wait_for_decompress_done(void)
5533b2e9
LL
3732{
3733 int idx, thread_count;
3734
3735 if (!migrate_use_compression()) {
34ab9e97 3736 return 0;
5533b2e9
LL
3737 }
3738
3739 thread_count = migrate_decompress_threads();
3740 qemu_mutex_lock(&decomp_done_lock);
3741 for (idx = 0; idx < thread_count; idx++) {
3742 while (!decomp_param[idx].done) {
3743 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3744 }
3745 }
3746 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3747 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3748}
3749
f0afa331 3750static void compress_threads_load_cleanup(void)
56e93d26
JQ
3751{
3752 int i, thread_count;
3753
3416ab5b
JQ
3754 if (!migrate_use_compression()) {
3755 return;
3756 }
56e93d26
JQ
3757 thread_count = migrate_decompress_threads();
3758 for (i = 0; i < thread_count; i++) {
797ca154
XG
3759 /*
3760 * we use it as an indicator of whether the thread is
3761 * properly initialized or not
3762 */
3763 if (!decomp_param[i].compbuf) {
3764 break;
3765 }
3766
56e93d26 3767 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3768 decomp_param[i].quit = true;
56e93d26
JQ
3769 qemu_cond_signal(&decomp_param[i].cond);
3770 qemu_mutex_unlock(&decomp_param[i].mutex);
3771 }
3772 for (i = 0; i < thread_count; i++) {
797ca154
XG
3773 if (!decomp_param[i].compbuf) {
3774 break;
3775 }
3776
56e93d26
JQ
3777 qemu_thread_join(decompress_threads + i);
3778 qemu_mutex_destroy(&decomp_param[i].mutex);
3779 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3780 inflateEnd(&decomp_param[i].stream);
56e93d26 3781 g_free(decomp_param[i].compbuf);
797ca154 3782 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3783 }
3784 g_free(decompress_threads);
3785 g_free(decomp_param);
56e93d26
JQ
3786 decompress_threads = NULL;
3787 decomp_param = NULL;
34ab9e97 3788 decomp_file = NULL;
56e93d26
JQ
3789}
3790
34ab9e97 3791static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3792{
3793 int i, thread_count;
3794
3795 if (!migrate_use_compression()) {
3796 return 0;
3797 }
3798
3799 thread_count = migrate_decompress_threads();
3800 decompress_threads = g_new0(QemuThread, thread_count);
3801 decomp_param = g_new0(DecompressParam, thread_count);
3802 qemu_mutex_init(&decomp_done_lock);
3803 qemu_cond_init(&decomp_done_cond);
34ab9e97 3804 decomp_file = f;
797ca154
XG
3805 for (i = 0; i < thread_count; i++) {
3806 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3807 goto exit;
3808 }
3809
3810 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3811 qemu_mutex_init(&decomp_param[i].mutex);
3812 qemu_cond_init(&decomp_param[i].cond);
3813 decomp_param[i].done = true;
3814 decomp_param[i].quit = false;
3815 qemu_thread_create(decompress_threads + i, "decompress",
3816 do_data_decompress, decomp_param + i,
3817 QEMU_THREAD_JOINABLE);
3818 }
3819 return 0;
3820exit:
3821 compress_threads_load_cleanup();
3822 return -1;
3823}
3824
c1bc6626 3825static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3826 void *host, int len)
3827{
3828 int idx, thread_count;
3829
3830 thread_count = migrate_decompress_threads();
37396950 3831 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3832 while (true) {
3833 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3834 if (decomp_param[idx].done) {
33d151f4
LL
3835 decomp_param[idx].done = false;
3836 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3837 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3838 decomp_param[idx].des = host;
3839 decomp_param[idx].len = len;
33d151f4
LL
3840 qemu_cond_signal(&decomp_param[idx].cond);
3841 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3842 break;
3843 }
3844 }
3845 if (idx < thread_count) {
3846 break;
73a8912b
LL
3847 } else {
3848 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3849 }
3850 }
3851}
3852
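/*
 * Illustrative sketch (not QEMU code) of the handoff protocol between
 * decompress_data_with_multi_threads() and do_data_decompress() above:
 * each worker owns a mutex/cond pair plus a "done" flag guarded by a shared
 * completion lock.  The dispatcher picks an idle worker, hands it a job
 * under the worker's lock, and the worker raises the shared condition when
 * it finishes, so the dispatcher can sleep instead of spinning.  All names
 * (struct worker, worker_fn, dispatch, done_lock, done_cond) are
 * hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct worker {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    void           *job;        /* NULL means "no work pending"            */
    bool            done;       /* guarded by done_lock, not by 'lock'     */
    bool            quit;
};

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;

static void *worker_fn(void *opaque)
{
    struct worker *w = opaque;

    pthread_mutex_lock(&w->lock);
    while (!w->quit) {
        if (w->job) {
            void *job = w->job;
            w->job = NULL;
            pthread_mutex_unlock(&w->lock);

            (void)job;          /* ... decompress the page here ...        */

            pthread_mutex_lock(&done_lock);
            w->done = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);

            pthread_mutex_lock(&w->lock);
        } else {
            pthread_cond_wait(&w->cond, &w->lock);
        }
    }
    pthread_mutex_unlock(&w->lock);
    return NULL;
}

static void dispatch(struct worker *workers, int n, void *job)
{
    pthread_mutex_lock(&done_lock);
    for (;;) {
        for (int i = 0; i < n; i++) {
            if (workers[i].done) {
                workers[i].done = false;
                pthread_mutex_lock(&workers[i].lock);
                workers[i].job = job;
                pthread_cond_signal(&workers[i].cond);
                pthread_mutex_unlock(&workers[i].lock);
                pthread_mutex_unlock(&done_lock);
                return;
            }
        }
        /* all workers busy: wait until one of them signals completion */
        pthread_cond_wait(&done_cond, &done_lock);
    }
}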
b70cb3b4
RL
3853static void colo_init_ram_state(void)
3854{
3855 ram_state_init(&ram_state);
b70cb3b4
RL
3856}
3857
13af18f2
ZC
3858/*
3859 * colo cache: this is for the secondary VM, where we cache the whole
3860 * memory of the secondary VM.  The global lock must be held when
3861 * calling this helper.
3862 */
3863int colo_init_ram_cache(void)
3864{
3865 RAMBlock *block;
3866
44901b5a
PB
3867 WITH_RCU_READ_LOCK_GUARD() {
3868 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3869 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3870 NULL, false, false);
44901b5a
PB
3871 if (!block->colo_cache) {
3872 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3873 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3874 block->used_length);
3875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3876 if (block->colo_cache) {
3877 qemu_anon_ram_free(block->colo_cache, block->used_length);
3878 block->colo_cache = NULL;
3879 }
89ac5a1d 3880 }
44901b5a 3881 return -errno;
89ac5a1d 3882 }
e5fdf920
LS
3883 if (!machine_dump_guest_core(current_machine)) {
3884 qemu_madvise(block->colo_cache, block->used_length,
3885 QEMU_MADV_DONTDUMP);
3886 }
13af18f2 3887 }
13af18f2 3888 }
44901b5a 3889
7d9acafa
ZC
3890 /*
3891 * Record the dirty pages that were sent by the PVM; we use this dirty
3892 * bitmap to decide which pages in the cache should be flushed into the
3893 * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3894 */
3895 if (ram_bytes_total()) {
3896 RAMBlock *block;
3897
fbd162e6 3898 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3899 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3900 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3901 }
3902 }
7d9acafa 3903
b70cb3b4 3904 colo_init_ram_state();
13af18f2 3905 return 0;
13af18f2
ZC
3906}
3907
0393031a
HZ
3908/* TODO: duplicated with ram_init_bitmaps */
3909void colo_incoming_start_dirty_log(void)
3910{
3911 RAMBlock *block = NULL;
3912 /* For memory_global_dirty_log_start below. */
3913 qemu_mutex_lock_iothread();
3914 qemu_mutex_lock_ramlist();
3915
3916 memory_global_dirty_log_sync();
3917 WITH_RCU_READ_LOCK_GUARD() {
3918 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3919 ramblock_sync_dirty_bitmap(ram_state, block);
3920 /* Discard this dirty bitmap record */
3921 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3922 }
63b41db4 3923 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3924 }
3925 ram_state->migration_dirty_pages = 0;
3926 qemu_mutex_unlock_ramlist();
3927 qemu_mutex_unlock_iothread();
3928}
3929
13af18f2
ZC
3930/* The global lock must be held when calling this helper */
3931void colo_release_ram_cache(void)
3932{
3933 RAMBlock *block;
3934
63b41db4 3935 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3936 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3937 g_free(block->bmap);
3938 block->bmap = NULL;
3939 }
3940
89ac5a1d
DDAG
3941 WITH_RCU_READ_LOCK_GUARD() {
3942 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3943 if (block->colo_cache) {
3944 qemu_anon_ram_free(block->colo_cache, block->used_length);
3945 block->colo_cache = NULL;
3946 }
13af18f2
ZC
3947 }
3948 }
0393031a 3949 ram_state_cleanup(&ram_state);
13af18f2
ZC
3950}
3951
f265e0e4
JQ
3952/**
3953 * ram_load_setup: Setup RAM for migration incoming side
3954 *
3955 * Returns zero to indicate success and negative for error
3956 *
3957 * @f: QEMUFile where to receive the data
3958 * @opaque: RAMState pointer
3959 */
3960static int ram_load_setup(QEMUFile *f, void *opaque)
3961{
34ab9e97 3962 if (compress_threads_load_setup(f)) {
797ca154
XG
3963 return -1;
3964 }
3965
f265e0e4 3966 xbzrle_load_setup();
f9494614 3967 ramblock_recv_map_init();
13af18f2 3968
f265e0e4
JQ
3969 return 0;
3970}
3971
3972static int ram_load_cleanup(void *opaque)
3973{
f9494614 3974 RAMBlock *rb;
56eb90af 3975
fbd162e6 3976 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3977 qemu_ram_block_writeback(rb);
56eb90af
JH
3978 }
3979
f265e0e4 3980 xbzrle_load_cleanup();
f0afa331 3981 compress_threads_load_cleanup();
f9494614 3982
fbd162e6 3983 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3984 g_free(rb->receivedmap);
3985 rb->receivedmap = NULL;
3986 }
13af18f2 3987
f265e0e4
JQ
3988 return 0;
3989}
3990
3d0684b2
JQ
3991/**
3992 * ram_postcopy_incoming_init: allocate postcopy data structures
3993 *
3994 * Returns 0 for success and negative if there was one error
3995 *
3996 * @mis: current migration incoming state
3997 *
3998 * Allocate data structures etc needed by incoming migration with
3999 * postcopy-ram. postcopy-ram's similarly named
4000 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4001 */
4002int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4003{
c136180c 4004 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4005}
4006
3d0684b2
JQ
4007/**
4008 * ram_load_postcopy: load a page in postcopy case
4009 *
4010 * Returns 0 for success or -errno in case of error
4011 *
a7180877
DDAG
4012 * Called in postcopy mode by ram_load().
4013 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4014 *
4015 * @f: QEMUFile to receive the data from
36f62f11 4016 * @channel: the channel to use for loading
a7180877 4017 */
36f62f11 4018int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
4019{
4020 int flags = 0, ret = 0;
4021 bool place_needed = false;
1aa83678 4022 bool matches_target_page_size = false;
a7180877 4023 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 4024 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
4025
4026 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4027 ram_addr_t addr;
a7180877
DDAG
4028 void *page_buffer = NULL;
4029 void *place_source = NULL;
df9ff5e1 4030 RAMBlock *block = NULL;
a7180877 4031 uint8_t ch;
644acf99 4032 int len;
a7180877
DDAG
4033
4034 addr = qemu_get_be64(f);
7a9ddfbf
PX
4035
4036 /*
4037 * If qemu file error, we should stop here, and then "addr"
4038 * may be invalid
4039 */
4040 ret = qemu_file_get_error(f);
4041 if (ret) {
4042 break;
4043 }
4044
a7180877
DDAG
4045 flags = addr & ~TARGET_PAGE_MASK;
4046 addr &= TARGET_PAGE_MASK;
4047
36f62f11 4048 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4049 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4050 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4051 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4052 if (!block) {
4053 ret = -EINVAL;
4054 break;
4055 }
4c4bad48 4056
898ba906
DH
4057 /*
4058 * Relying on used_length is racy and can result in false positives.
4059 * We might place pages beyond used_length in case RAM was shrunk
4060 * while in postcopy, which is fine - trying to place via
4061 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4062 */
4063 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4064 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4065 ret = -EINVAL;
4066 break;
4067 }
77dadc3f 4068 tmp_page->target_pages++;
1aa83678 4069 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4070 /*
28abd200
DDAG
4071 * Postcopy requires that we place whole host pages atomically;
4072 * these may be huge pages for RAMBlocks that are backed by
4073 * hugetlbfs.
a7180877
DDAG
4074 * To make it atomic, the data is read into a temporary page
4075 * that's moved into place later.
4076 * The migration protocol uses, possibly smaller, target pages;
4077 * however, the source ensures it always sends all the components
91ba442f 4078 * of a host page in one chunk.
a7180877 4079 */
77dadc3f 4080 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4081 host_page_offset_from_ram_block_offset(block, addr);
4082 /* If all TP are zero then we can optimise the place */
77dadc3f
PX
4083 if (tmp_page->target_pages == 1) {
4084 tmp_page->host_addr =
4085 host_page_from_ram_block_offset(block, addr);
4086 } else if (tmp_page->host_addr !=
4087 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4088 /* not the 1st TP within the HP */
36f62f11 4089 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4090 "Target host page %p, received host page %p "
4091 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4092 channel, tmp_page->host_addr,
cfc7dc8a
PX
4093 host_page_from_ram_block_offset(block, addr),
4094 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4095 ret = -EINVAL;
4096 break;
a7180877
DDAG
4097 }
4098
4099 /*
4100 * If it's the last part of a host page then we place the host
4101 * page
4102 */
77dadc3f
PX
4103 if (tmp_page->target_pages ==
4104 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4105 place_needed = true;
4cbb3c63 4106 }
77dadc3f 4107 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4108 }
4109
4110 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4111 case RAM_SAVE_FLAG_ZERO:
a7180877 4112 ch = qemu_get_byte(f);
2e36bc1b
WY
4113 /*
4114 * We can skip setting page_buffer when
4115 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4116 */
4117 if (ch || !matches_target_page_size) {
4118 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4119 }
a7180877 4120 if (ch) {
77dadc3f 4121 tmp_page->all_zero = false;
a7180877
DDAG
4122 }
4123 break;
4124
4125 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4126 tmp_page->all_zero = false;
1aa83678
PX
4127 if (!matches_target_page_size) {
4128 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4129 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4130 } else {
1aa83678
PX
4131 /*
4132 * For small pages that match the target page size, we
4133 * avoid the qemu_file copy. Instead we directly use
4134 * the buffer of QEMUFile to place the page. Note: we
4135 * cannot do any QEMUFile operation before using that
4136 * buffer to make sure the buffer is valid when
4137 * placing the page.
a7180877
DDAG
4138 */
4139 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4140 TARGET_PAGE_SIZE);
4141 }
4142 break;
644acf99 4143 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4144 tmp_page->all_zero = false;
644acf99
WY
4145 len = qemu_get_be32(f);
4146 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4147 error_report("Invalid compressed data length: %d", len);
4148 ret = -EINVAL;
4149 break;
4150 }
4151 decompress_data_with_multi_threads(f, page_buffer, len);
4152 break;
4153
a7180877
DDAG
4154 case RAM_SAVE_FLAG_EOS:
4155 /* normal exit */
6df264ac 4156 multifd_recv_sync_main();
a7180877
DDAG
4157 break;
4158 default:
29fccade 4159 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4160 " (postcopy mode)", flags);
4161 ret = -EINVAL;
7a9ddfbf
PX
4162 break;
4163 }
4164
644acf99
WY
4165 /* Got the whole host page, wait for decompress before placing. */
4166 if (place_needed) {
4167 ret |= wait_for_decompress_done();
4168 }
4169
7a9ddfbf
PX
4170 /* Check for any possible file errors */
4171 if (!ret && qemu_file_get_error(f)) {
4172 ret = qemu_file_get_error(f);
a7180877
DDAG
4173 }
4174
7a9ddfbf 4175 if (!ret && place_needed) {
77dadc3f
PX
4176 if (tmp_page->all_zero) {
4177 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4178 } else {
77dadc3f
PX
4179 ret = postcopy_place_page(mis, tmp_page->host_addr,
4180 place_source, block);
a7180877 4181 }
ddf35bdf 4182 place_needed = false;
77dadc3f 4183 postcopy_temp_page_reset(tmp_page);
a7180877 4184 }
a7180877
DDAG
4185 }
4186
4187 return ret;
4188}
4189
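/*
 * Illustrative sketch (not QEMU code) of the host-page assembly performed
 * by ram_load_postcopy() above: target-page-sized chunks are copied into a
 * temporary buffer at their offset within the host page, and only once all
 * of them have arrived is the complete host page handed to an atomic
 * placement routine (place_fn below stands in for postcopy_place_page(),
 * i.e. UFFDIO_COPY).  struct hp_assembler and hp_feed are hypothetical.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct hp_assembler {
    uint8_t *tmp;               /* host_page_size bytes of scratch space   */
    size_t host_page_size;
    size_t target_page_size;
    size_t received;            /* target pages gathered so far            */
};

typedef int (*place_fn)(void *host_page_addr, const void *src, size_t len);

static int hp_feed(struct hp_assembler *a, void *host_page_addr,
                   size_t offset_in_host_page, const uint8_t *data,
                   place_fn place)
{
    memcpy(a->tmp + offset_in_host_page, data, a->target_page_size);
    a->received++;

    if (a->received == a->host_page_size / a->target_page_size) {
        a->received = 0;        /* reset for the next host page            */
        return place(host_page_addr, a->tmp, a->host_page_size);
    }
    return 0;                   /* still waiting for more target pages     */
}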
acab30b8
DHB
4190static bool postcopy_is_running(void)
4191{
4192 PostcopyState ps = postcopy_state_get();
4193 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4194}
4195
e6f4aa18
ZC
4196/*
4197 * Flush content of RAM cache into SVM's memory.
4198 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4199 */
24fa16f8 4200void colo_flush_ram_cache(void)
e6f4aa18
ZC
4201{
4202 RAMBlock *block = NULL;
4203 void *dst_host;
4204 void *src_host;
4205 unsigned long offset = 0;
4206
d1955d22 4207 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4208 WITH_RCU_READ_LOCK_GUARD() {
4209 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4210 ramblock_sync_dirty_bitmap(ram_state, block);
4211 }
d1955d22 4212 }
d1955d22 4213
e6f4aa18 4214 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4215 WITH_RCU_READ_LOCK_GUARD() {
4216 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4217
89ac5a1d 4218 while (block) {
a6a83cef 4219 unsigned long num = 0;
e6f4aa18 4220
a6a83cef 4221 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4222 if (!offset_in_ramblock(block,
4223 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4224 offset = 0;
a6a83cef 4225 num = 0;
89ac5a1d
DDAG
4226 block = QLIST_NEXT_RCU(block, next);
4227 } else {
a6a83cef
RL
4228 unsigned long i = 0;
4229
4230 for (i = 0; i < num; i++) {
4231 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4232 }
8bba004c
AR
4233 dst_host = block->host
4234 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4235 src_host = block->colo_cache
4236 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4237 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4238 offset += num;
89ac5a1d 4239 }
e6f4aa18
ZC
4240 }
4241 }
e6f4aa18
ZC
4242 trace_colo_flush_ram_cache_end();
4243}
4244
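/*
 * Illustrative sketch (not QEMU code) of what colo_flush_ram_cache() above
 * does per block: scan the dirty bitmap for runs of consecutive dirty
 * pages, copy each run from the cache back into live RAM with a single
 * memcpy, and clear the bits that were flushed.  EX_PAGE_SIZE and the
 * helper names are hypothetical.
 */
#include <stdint.h>
#include <string.h>
#include <stdbool.h>

#define EX_PAGE_SIZE 4096u
#define ULONG_BITS   (8 * sizeof(unsigned long))

static bool ex_test_bit(const unsigned long *map, unsigned long nr)
{
    return (map[nr / ULONG_BITS] >> (nr % ULONG_BITS)) & 1;
}

static void ex_clear_bit(unsigned long *map, unsigned long nr)
{
    map[nr / ULONG_BITS] &= ~(1ul << (nr % ULONG_BITS));
}

static void flush_dirty_runs(uint8_t *ram, const uint8_t *cache,
                             unsigned long *dirty, unsigned long npages)
{
    unsigned long i = 0;

    while (i < npages) {
        if (!ex_test_bit(dirty, i)) {
            i++;
            continue;
        }
        /* extend the run of consecutive dirty pages as far as it goes */
        unsigned long run = 1;
        while (i + run < npages && ex_test_bit(dirty, i + run)) {
            run++;
        }
        memcpy(ram + (size_t)i * EX_PAGE_SIZE,
               cache + (size_t)i * EX_PAGE_SIZE,
               (size_t)run * EX_PAGE_SIZE);
        for (unsigned long j = 0; j < run; j++) {
            ex_clear_bit(dirty, i + j);
        }
        i += run;
    }
}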
10da4a36
WY
4245/**
4246 * ram_load_precopy: load pages in precopy case
4247 *
4248 * Returns 0 for success or -errno in case of error
4249 *
4250 * Called in precopy mode by ram_load().
4251 * rcu_read_lock is taken prior to this being called.
4252 *
4254 * @f: QEMUFile to receive the data from
4254 */
4255static int ram_load_precopy(QEMUFile *f)
56e93d26 4256{
755e8d7c 4257 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4258 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4259 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4260 bool postcopy_advised = migration_incoming_postcopy_advised();
edc60127
JQ
4261 if (!migrate_use_compression()) {
4262 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4263 }
a7180877 4264
10da4a36 4265 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4266 ram_addr_t addr, total_ram_bytes;
0393031a 4267 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4268 uint8_t ch;
4269
e65cec5e
YK
4270 /*
4271 * Yield periodically to let the main loop run, but an iteration of
4272 * the main loop is expensive, so only do it once every few iterations.
4273 */
4274 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4275 aio_co_schedule(qemu_get_current_aio_context(),
4276 qemu_coroutine_self());
4277 qemu_coroutine_yield();
4278 }
4279 i++;
4280
56e93d26
JQ
4281 addr = qemu_get_be64(f);
4282 flags = addr & ~TARGET_PAGE_MASK;
4283 addr &= TARGET_PAGE_MASK;
4284
edc60127
JQ
4285 if (flags & invalid_flags) {
4286 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4287 error_report("Received an unexpected compressed page");
4288 }
4289
4290 ret = -EINVAL;
4291 break;
4292 }
4293
bb890ed5 4294 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4295 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4296 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4297 RAM_CHANNEL_PRECOPY);
4c4bad48 4298
0393031a 4299 host = host_from_ram_block_offset(block, addr);
13af18f2 4300 /*
0393031a
HZ
4301 * After entering the COLO stage, we should not load the page
4302 * into the SVM's memory directly; we put it into colo_cache first.
4303 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4304 * Previously, we copied all this memory in the COLO preparing stage,
4305 * during which the VM had to be stopped, which was time-consuming.
4306 * Here we optimize it with a trick: back up every page during the
4307 * migration process while COLO is enabled.  Although this affects
4308 * migration speed, it clearly reduces the downtime of backing up
4309 * all of the SVM's memory in the COLO preparing stage.
13af18f2 4310 */
0393031a
HZ
4311 if (migration_incoming_colo_enabled()) {
4312 if (migration_incoming_in_colo_state()) {
4313 /* In COLO stage, put all pages into cache temporarily */
8af66371 4314 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4315 } else {
4316 /*
4317 * In migration stage but before COLO stage,
4318 * Put all pages into both cache and SVM's memory.
4319 */
8af66371 4320 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4321 }
13af18f2 4322 }
a776aa15
DDAG
4323 if (!host) {
4324 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4325 ret = -EINVAL;
4326 break;
4327 }
13af18f2
ZC
4328 if (!migration_incoming_in_colo_state()) {
4329 ramblock_recv_bitmap_set(block, host);
4330 }
4331
1db9d8e5 4332 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4333 }
4334
56e93d26
JQ
4335 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4336 case RAM_SAVE_FLAG_MEM_SIZE:
4337 /* Synchronize RAM block list */
4338 total_ram_bytes = addr;
4339 while (!ret && total_ram_bytes) {
4340 RAMBlock *block;
56e93d26
JQ
4341 char id[256];
4342 ram_addr_t length;
4343
4344 len = qemu_get_byte(f);
4345 qemu_get_buffer(f, (uint8_t *)id, len);
4346 id[len] = 0;
4347 length = qemu_get_be64(f);
4348
e3dd7493 4349 block = qemu_ram_block_by_name(id);
b895de50
CLG
4350 if (block && !qemu_ram_is_migratable(block)) {
4351 error_report("block %s should not be migrated !", id);
4352 ret = -EINVAL;
4353 } else if (block) {
e3dd7493
DDAG
4354 if (length != block->used_length) {
4355 Error *local_err = NULL;
56e93d26 4356
fa53a0e5 4357 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4358 &local_err);
4359 if (local_err) {
4360 error_report_err(local_err);
56e93d26 4361 }
56e93d26 4362 }
ef08fb38 4363 /* For postcopy we need to check hugepage sizes match */
e846b746 4364 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4365 block->page_size != qemu_host_page_size) {
4366 uint64_t remote_page_size = qemu_get_be64(f);
4367 if (remote_page_size != block->page_size) {
4368 error_report("Mismatched RAM page size %s "
4369 "(local) %zd != %" PRId64,
4370 id, block->page_size,
4371 remote_page_size);
4372 ret = -EINVAL;
4373 }
4374 }
fbd162e6
YK
4375 if (migrate_ignore_shared()) {
4376 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4377 if (ramblock_is_ignored(block) &&
4378 block->mr->addr != addr) {
4379 error_report("Mismatched GPAs for block %s "
4380 "%" PRId64 "!= %" PRId64,
4381 id, (uint64_t)addr,
4382 (uint64_t)block->mr->addr);
4383 ret = -EINVAL;
4384 }
4385 }
e3dd7493
DDAG
4386 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4387 block->idstr);
4388 } else {
56e93d26
JQ
4389 error_report("Unknown ramblock \"%s\", cannot "
4390 "accept migration", id);
4391 ret = -EINVAL;
4392 }
4393
4394 total_ram_bytes -= length;
4395 }
4396 break;
a776aa15 4397
bb890ed5 4398 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4399 ch = qemu_get_byte(f);
4400 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4401 break;
a776aa15 4402
56e93d26 4403 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4404 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4405 break;
56e93d26 4406
a776aa15 4407 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4408 len = qemu_get_be32(f);
4409 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4410 error_report("Invalid compressed data length: %d", len);
4411 ret = -EINVAL;
4412 break;
4413 }
c1bc6626 4414 decompress_data_with_multi_threads(f, host, len);
56e93d26 4415 break;
a776aa15 4416
56e93d26 4417 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4418 if (load_xbzrle(f, addr, host) < 0) {
4419 error_report("Failed to decompress XBZRLE page at "
4420 RAM_ADDR_FMT, addr);
4421 ret = -EINVAL;
4422 break;
4423 }
4424 break;
4425 case RAM_SAVE_FLAG_EOS:
4426 /* normal exit */
6df264ac 4427 multifd_recv_sync_main();
56e93d26
JQ
4428 break;
4429 default:
4430 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4431 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4432 } else {
29fccade 4433 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4434 flags);
4435 ret = -EINVAL;
4436 }
4437 }
4438 if (!ret) {
4439 ret = qemu_file_get_error(f);
4440 }
0393031a
HZ
4441 if (!ret && host_bak) {
4442 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4443 }
56e93d26
JQ
4444 }
4445
ca1a6b70 4446 ret |= wait_for_decompress_done();
10da4a36
WY
4447 return ret;
4448}
4449
4450static int ram_load(QEMUFile *f, void *opaque, int version_id)
4451{
4452 int ret = 0;
4453 static uint64_t seq_iter;
4454 /*
4455 * If system is running in postcopy mode, page inserts to host memory must
4456 * be atomic
4457 */
4458 bool postcopy_running = postcopy_is_running();
4459
4460 seq_iter++;
4461
4462 if (version_id != 4) {
4463 return -EINVAL;
4464 }
4465
4466 /*
4467 * This RCU critical section can be very long running.
4468 * When RCU reclaims in the code start to become numerous,
4469 * it will be necessary to reduce the granularity of this
4470 * critical section.
4471 */
89ac5a1d
DDAG
4472 WITH_RCU_READ_LOCK_GUARD() {
4473 if (postcopy_running) {
36f62f11
PX
4474 /*
4475 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4476 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4477 * service fast page faults.
4478 */
4479 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4480 } else {
4481 ret = ram_load_precopy(f);
4482 }
10da4a36 4483 }
55c4446b 4484 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4485
56e93d26
JQ
4486 return ret;
4487}
4488
c6467627
VSO
4489static bool ram_has_postcopy(void *opaque)
4490{
469dd51b 4491 RAMBlock *rb;
fbd162e6 4492 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4493 if (ramblock_is_pmem(rb)) {
4494 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4495 "is not supported now!", rb->idstr, rb->host);
4496 return false;
4497 }
4498 }
4499
c6467627
VSO
4500 return migrate_postcopy_ram();
4501}
4502
edd090c7
PX
4503/* Sync all the dirty bitmap with destination VM. */
4504static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4505{
4506 RAMBlock *block;
4507 QEMUFile *file = s->to_dst_file;
4508 int ramblock_count = 0;
4509
4510 trace_ram_dirty_bitmap_sync_start();
4511
fbd162e6 4512 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4513 qemu_savevm_send_recv_bitmap(file, block->idstr);
4514 trace_ram_dirty_bitmap_request(block->idstr);
4515 ramblock_count++;
4516 }
4517
4518 trace_ram_dirty_bitmap_sync_wait();
4519
4520 /* Wait until all the ramblocks' dirty bitmap synced */
4521 while (ramblock_count--) {
4522 qemu_sem_wait(&s->rp_state.rp_sem);
4523 }
4524
4525 trace_ram_dirty_bitmap_sync_complete();
4526
4527 return 0;
4528}
4529
4530static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4531{
4532 qemu_sem_post(&s->rp_state.rp_sem);
4533}
4534
a335debb
PX
4535/*
4536 * Read the received bitmap, revert it as the initial dirty bitmap.
4537 * This is only used when the postcopy migration is paused but wants
4538 * to resume from a middle point.
4539 */
4540int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4541{
4542 int ret = -EINVAL;
43044ac0 4543 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4544 QEMUFile *file = s->rp_state.from_dst_file;
4545 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4546 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4547 uint64_t size, end_mark;
4548
4549 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4550
4551 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4552 error_report("%s: incorrect state %s", __func__,
4553 MigrationStatus_str(s->state));
4554 return -EINVAL;
4555 }
4556
4557 /*
4558 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4559 * need the endianness conversion, and the paddings.
a335debb
PX
4560 */
4561 local_size = ROUND_UP(local_size, 8);
4562
4563 /* Add paddings */
4564 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4565
4566 size = qemu_get_be64(file);
4567
4568 /* The size of the bitmap should match that of our ramblock */
4569 if (size != local_size) {
4570 error_report("%s: ramblock '%s' bitmap size mismatch "
4571 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4572 block->idstr, size, local_size);
4573 ret = -EINVAL;
4574 goto out;
4575 }
4576
4577 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4578 end_mark = qemu_get_be64(file);
4579
4580 ret = qemu_file_get_error(file);
4581 if (ret || size != local_size) {
4582 error_report("%s: read bitmap failed for ramblock '%s': %d"
4583 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4584 __func__, block->idstr, ret, local_size, size);
4585 ret = -EIO;
4586 goto out;
4587 }
4588
4589 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4590 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4591 __func__, block->idstr, end_mark);
4592 ret = -EINVAL;
4593 goto out;
4594 }
4595
4596 /*
3a4452d8 4597 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4598 * The dirty bitmap won't change. We can directly modify it.
4599 */
4600 bitmap_from_le(block->bmap, le_bitmap, nbits);
4601
4602 /*
4603 * What we received is the "received bitmap".  Invert it to form the
4604 * initial dirty bitmap for this ramblock.
4605 */
4606 bitmap_complement(block->bmap, block->bmap, nbits);
4607
be39b4cd
DH
4608 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4609 ramblock_dirty_bitmap_clear_discarded_pages(block);
4610
4611 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4612 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4613
edd090c7
PX
4614 /*
4615 * We succeeded in syncing the bitmap for the current ramblock. If this is
4616 * the last one to sync, we need to notify the main send thread.
4617 */
4618 ram_dirty_bitmap_reload_notify(s);
4619
a335debb
PX
4620 ret = 0;
4621out:
bf269906 4622 g_free(le_bitmap);
a335debb
PX
4623 return ret;
4624}
4625
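/*
 * Illustrative sketch (not QEMU code) of the "revert" step performed by
 * ram_dirty_bitmap_reload() above: the destination reports which pages it
 * has already received, so complementing that bitmap yields exactly the
 * set of pages the source still has to (re)send after a postcopy recovery.
 * received_to_dirty is a hypothetical name.
 */
#include <stddef.h>

#define BITS_PER_ULONG (8 * sizeof(unsigned long))

static void received_to_dirty(unsigned long *bmap, size_t nbits)
{
    size_t nwords = (nbits + BITS_PER_ULONG - 1) / BITS_PER_ULONG;

    for (size_t i = 0; i < nwords; i++) {
        bmap[i] = ~bmap[i];     /* received -> still dirty                 */
    }
    /* clear the padding bits beyond nbits in the last word */
    if (nbits % BITS_PER_ULONG) {
        bmap[nwords - 1] &= (1ul << (nbits % BITS_PER_ULONG)) - 1;
    }
}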
edd090c7
PX
4626static int ram_resume_prepare(MigrationState *s, void *opaque)
4627{
4628 RAMState *rs = *(RAMState **)opaque;
08614f34 4629 int ret;
edd090c7 4630
08614f34
PX
4631 ret = ram_dirty_bitmap_sync_all(s, rs);
4632 if (ret) {
4633 return ret;
4634 }
4635
4636 ram_state_resume_prepare(rs, s->to_dst_file);
4637
4638 return 0;
edd090c7
PX
4639}
4640
36f62f11
PX
4641void postcopy_preempt_shutdown_file(MigrationState *s)
4642{
4643 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4644 qemu_fflush(s->postcopy_qemufile_src);
4645}
4646
56e93d26 4647static SaveVMHandlers savevm_ram_handlers = {
9907e842 4648 .save_setup = ram_save_setup,
56e93d26 4649 .save_live_iterate = ram_save_iterate,
763c906b 4650 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4651 .save_live_complete_precopy = ram_save_complete,
c6467627 4652 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4653 .state_pending_exact = ram_state_pending_exact,
4654 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4655 .load_state = ram_load,
f265e0e4
JQ
4656 .save_cleanup = ram_save_cleanup,
4657 .load_setup = ram_load_setup,
4658 .load_cleanup = ram_load_cleanup,
edd090c7 4659 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4660};
4661
c7c0e724
DH
4662static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4663 size_t old_size, size_t new_size)
4664{
cc61c703 4665 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4666 ram_addr_t offset;
4667 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4668 Error *err = NULL;
4669
4670 if (ramblock_is_ignored(rb)) {
4671 return;
4672 }
4673
4674 if (!migration_is_idle()) {
4675 /*
4676 * Precopy code on the source cannot deal with the size of RAM blocks
4677 * changing at random points in time - especially after sending the
4678 * RAM block sizes in the migration stream, they must no longer change.
4679 * Abort and indicate a proper reason.
4680 */
4681 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4682 migration_cancel(err);
c7c0e724 4683 error_free(err);
c7c0e724 4684 }
cc61c703
DH
4685
4686 switch (ps) {
4687 case POSTCOPY_INCOMING_ADVISE:
4688 /*
4689 * Update what ram_postcopy_incoming_init()->init_range() does at the
4690 * time postcopy was advised. Syncing RAM blocks with the source will
4691 * result in RAM resizes.
4692 */
4693 if (old_size < new_size) {
4694 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4695 error_report("RAM block '%s' discard of resized RAM failed",
4696 rb->idstr);
4697 }
4698 }
898ba906 4699 rb->postcopy_length = new_size;
cc61c703
DH
4700 break;
4701 case POSTCOPY_INCOMING_NONE:
4702 case POSTCOPY_INCOMING_RUNNING:
4703 case POSTCOPY_INCOMING_END:
4704 /*
4705 * Once our guest is running, postcopy no longer cares about
4706 * resizes. When growing, the new memory was not available on the
4707 * source, no handler needed.
4708 */
4709 break;
4710 default:
4711 error_report("RAM block '%s' resized during postcopy state: %d",
4712 rb->idstr, ps);
4713 exit(-1);
4714 }
c7c0e724
DH
4715}
4716
4717static RAMBlockNotifier ram_mig_ram_notifier = {
4718 .ram_block_resized = ram_mig_ram_block_resized,
4719};
4720
56e93d26
JQ
4721void ram_mig_init(void)
4722{
4723 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4724 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4725 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4726}