/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram-compress.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"
#include "options.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value, and it was renamed to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
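/*
 * These flag bits travel in the low bits of the page offset written by
 * save_page_header(): offsets are target-page aligned, so the low bits are
 * otherwise zero, which is why no flag bigger than 0x200 can be used.
 */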

int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
                                 uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        /* We must check that AVX is not just available, but usable. */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see the comment
     * below). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when the source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
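    /*
     * Worked example: a RAMBlock of 100 target pages gives nbits = 100,
     * so DIV_ROUND_UP(100, 8) = 13 bytes, padded up to 16 so that both
     * 32-bit and 64-bit peers read a whole number of longs.
     */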
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
    stat64_add(&mig_stats.transferred, bytes);
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN, not a real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page. Return true
 * if they are, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
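/*
 * Wire layout produced by this function:
 *
 *   be64: page offset OR'ed with RAM_SAVE_FLAG_* bits
 *   [ u8 idstr length + idstr bytes ]  -- only when the block changes,
 *                                         i.e. RAM_SAVE_FLAG_CONTINUE unset
 */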
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
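            /*
             * Worked example: with throttle_now = 20 the guest currently
             * gets cpu_now = 80. If it dirtied twice as many bytes as we
             * could transfer (threshold/period = 1/2), cpu_ideal = 40 and
             * the increment is capped at MIN(80 - 40, pct_increment).
             */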
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
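/*
 * On the wire an XBZRLE page consists of the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, one ENCODING_FLAG_XBZRLE byte, a big-endian
 * 16-bit encoded length and then the encoded data, which is why the
 * accounting below adds encoded_len + 1 + 2 on top of the header size.
 */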
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                            TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found. Note that when pss->host_page_sending==true it means we're in
 * the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we are sending a host page, only look for dirty pages within
     * the current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
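    /*
     * Size of one clear chunk, e.g. with a clear_bmap_shift of 18 and
     * 4 KiB target pages this is 1ULL << (12 + 18) = 1 GiB per chunk.
     */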
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the
 * contiguous dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
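/*
 * Example: with dirty bits set for pages 5..7 and start = 0, this returns
 * first = 5 and sets *num = 3; if no dirty bit is found it returns the
 * block size in pages and leaves *num at 0.
 */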
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called within an RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
           stat64_get(&mig_stats.zero_pages) +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_compress()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                        rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */
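        /*
         * Worked example: with threshold = 50 and 1 GiB transferred in the
         * period, bytes_dirty_threshold is 512 MiB, so throttling starts
         * (or increases) once more than 512 MiB were dirtied in the same
         * period twice in a row.
         */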

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization for migration, so
     * we don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, or 0 if the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, f, block, offset);

    if (len) {
        stat64_add(&mig_stats.zero_pages, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&mig_stats.normal_pages, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&mig_stats.zero_pages, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&mig_stats.normal_pages, 1);

    return 1;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->result == RES_ZEROPAGE) {
        stat64_add(&mig_stats.zero_pages, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static int send_queued_data(CompressParam *param)
{
    PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
    MigrationState *ms = migrate_get_current();
    QEMUFile *file = ms->to_dst_file;
    int len = 0;

    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    if (param->result == RES_NONE) {
        return 0;
    }

    assert(block == pss->last_sent_block);

    if (param->result == RES_ZEROPAGE) {
        assert(qemu_file_buffer_empty(param->file));
        len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    } else if (param->result == RES_COMPRESS) {
        assert(!qemu_file_buffer_empty(param->file));
        len += save_page_header(pss, file, block,
                                offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
        len += qemu_put_qemu_file(file, param->file);
    } else {
        abort();
    }

    update_compress_thread_counts(param, len);

    return len;
}

static void ram_flush_compressed_data(RAMState *rs)
{
    if (!save_page_use_compression(rs)) {
        return;
    }

    flush_compressed_data(send_queued_data);
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            ram_flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

f7b9dcfb
DH
1589static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1590 ram_addr_t size)
1591{
5f19a449
DH
1592 const ram_addr_t end = offset + size;
1593
f7b9dcfb
DH
1594 /*
1595 * We read one byte of each page; this will preallocate page tables if
1596 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1597 * where no page was populated yet. This might require adaption when
1598 * supporting other mappings, like shmem.
1599 */
5f19a449 1600 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1601 char tmp = *((char *)block->host + offset);
1602
1603 /* Don't optimize the read out */
1604 asm volatile("" : "+r" (tmp));
1605 }
1606}
1607
6fee3a1f
DH
1608static inline int populate_read_section(MemoryRegionSection *section,
1609 void *opaque)
1610{
1611 const hwaddr size = int128_get64(section->size);
1612 hwaddr offset = section->offset_within_region;
1613 RAMBlock *block = section->mr->ram_block;
1614
1615 populate_read_range(block, offset, size);
1616 return 0;
1617}
1618
eeccb99c 1619/*
f7b9dcfb
DH
1620 * ram_block_populate_read: preallocate page tables and populate pages in the
1621 * RAM block by reading a byte of each page.
eeccb99c
AG
1622 *
1623 * Since it's solely used for userfault_fd WP feature, here we just
1624 * hardcode page size to qemu_real_host_page_size.
1625 *
82ea3e3b 1626 * @rb: RAM block to populate
eeccb99c 1627 */
6fee3a1f 1628static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1629{
6fee3a1f
DH
1630 /*
1631 * Skip populating all pages that fall into a discarded range as managed by
1632 * a RamDiscardManager responsible for the mapped memory region of the
1633 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1634 * must not get populated automatically. We don't have to track
1635 * modifications via userfaultfd WP reliably, because these pages will
1636 * not be part of the migration stream either way -- see
1637 * ramblock_dirty_bitmap_exclude_discarded_pages().
1638 *
1639 * Note: The result is only stable while migrating (precopy/postcopy).
1640 */
1641 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1642 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1643 MemoryRegionSection section = {
1644 .mr = rb->mr,
1645 .offset_within_region = 0,
1646 .size = rb->mr->size,
1647 };
1648
1649 ram_discard_manager_replay_populated(rdm, &section,
1650 populate_read_section, NULL);
1651 } else {
1652 populate_read_range(rb, 0, rb->used_length);
1653 }
eeccb99c
AG
1654}
1655
1656/*
1657 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1658 */
1659void ram_write_tracking_prepare(void)
1660{
82ea3e3b 1661 RAMBlock *block;
eeccb99c
AG
1662
1663 RCU_READ_LOCK_GUARD();
1664
82ea3e3b 1665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1666 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1667 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1668 continue;
1669 }
1670
1671 /*
1672 * Populate pages of the RAM block before enabling userfault_fd
1673 * write protection.
1674 *
1675 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1676 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1677 * pages with pte_none() entries in page table.
1678 */
f7b9dcfb 1679 ram_block_populate_read(block);
eeccb99c
AG
1680 }
1681}
1682
e41c5770
DH
1683static inline int uffd_protect_section(MemoryRegionSection *section,
1684 void *opaque)
1685{
1686 const hwaddr size = int128_get64(section->size);
1687 const hwaddr offset = section->offset_within_region;
1688 RAMBlock *rb = section->mr->ram_block;
1689 int uffd_fd = (uintptr_t)opaque;
1690
1691 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1692 false);
1693}
1694
1695static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1696{
1697 assert(rb->flags & RAM_UF_WRITEPROTECT);
1698
1699 /* See ram_block_populate_read() */
1700 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1701 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1702 MemoryRegionSection section = {
1703 .mr = rb->mr,
1704 .offset_within_region = 0,
1705 .size = rb->mr->size,
1706 };
1707
1708 return ram_discard_manager_replay_populated(rdm, &section,
1709 uffd_protect_section,
1710 (void *)(uintptr_t)uffd_fd);
1711 }
1712 return uffd_change_protection(uffd_fd, rb->host,
1713 rb->used_length, true, false);
1714}
1715
278e2f55
AG
1716/*
1717 * ram_write_tracking_start: start UFFD-WP memory tracking
1718 *
1719 * Returns 0 for success or negative value in case of error
1720 */
1721int ram_write_tracking_start(void)
1722{
1723 int uffd_fd;
1724 RAMState *rs = ram_state;
82ea3e3b 1725 RAMBlock *block;
278e2f55
AG
1726
1727 /* Open UFFD file descriptor */
1728 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1729 if (uffd_fd < 0) {
1730 return uffd_fd;
1731 }
1732 rs->uffdio_fd = uffd_fd;
1733
1734 RCU_READ_LOCK_GUARD();
1735
82ea3e3b 1736 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1737 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1738 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1739 continue;
1740 }
1741
1742 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1743 if (uffd_register_memory(rs->uffdio_fd, block->host,
1744 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1745 goto fail;
1746 }
72ef3a37
DH
1747 block->flags |= RAM_UF_WRITEPROTECT;
1748 memory_region_ref(block->mr);
1749
278e2f55 1750 /* Apply UFFD write protection to the block memory range */
e41c5770 1751 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1752 goto fail;
1753 }
278e2f55 1754
82ea3e3b
AG
1755 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1756 block->host, block->max_length);
278e2f55
AG
1757 }
1758
1759 return 0;
1760
1761fail:
1762 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1763
82ea3e3b
AG
1764 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1765 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1766 continue;
1767 }
82ea3e3b 1768 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1769 /* Cleanup flags and remove reference */
82ea3e3b
AG
1770 block->flags &= ~RAM_UF_WRITEPROTECT;
1771 memory_region_unref(block->mr);
278e2f55
AG
1772 }
1773
1774 uffd_close_fd(uffd_fd);
1775 rs->uffdio_fd = -1;
1776 return -1;
1777}
1778
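/*
 * Hedged sketch of the raw Linux userfaultfd ioctls that helpers such
 * as uffd_register_memory() and uffd_change_protection() wrap
 * (implemented elsewhere in QEMU's util code and possibly differing in
 * detail). Requires <sys/ioctl.h> and <linux/userfaultfd.h>; the
 * function name is illustrative only.
 */
static int uffd_wp_range_sketch(int uffd, void *addr, uint64_t len, bool wp)
{
    struct uffdio_register reg = {
        .range = { .start = (uint64_t)(uintptr_t)addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wprot = {
        .range = { .start = (uint64_t)(uintptr_t)addr, .len = len },
        .mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
    };

    /* Register once per range, then toggle protection as needed */
    if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
        return -errno;
    }
    return ioctl(uffd, UFFDIO_WRITEPROTECT, &wprot) ? -errno : 0;
}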
1779/**
1780 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1781 */
1782void ram_write_tracking_stop(void)
1783{
1784 RAMState *rs = ram_state;
82ea3e3b 1785 RAMBlock *block;
278e2f55
AG
1786
1787 RCU_READ_LOCK_GUARD();
1788
82ea3e3b
AG
1789 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1790 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1791 continue;
1792 }
82ea3e3b 1793 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1794
82ea3e3b
AG
1795 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1796 block->host, block->max_length);
278e2f55
AG
1797
1798 /* Cleanup flags and remove reference */
82ea3e3b
AG
1799 block->flags &= ~RAM_UF_WRITEPROTECT;
1800 memory_region_unref(block->mr);
278e2f55
AG
1801 }
1802
1803 /* Finally close UFFD file descriptor */
1804 uffd_close_fd(rs->uffdio_fd);
1805 rs->uffdio_fd = -1;
1806}
1807
1808#else
1809/* No target OS support, stubs just fail or ignore */
1810
1811static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1812{
1813 (void) rs;
1814 (void) offset;
1815
1816 return NULL;
1817}
1818
1819static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1820 unsigned long start_page)
1821{
1822 (void) rs;
1823 (void) pss;
1824 (void) start_page;
1825
1826 return 0;
1827}
1828
1829bool ram_write_tracking_available(void)
1830{
1831 return false;
1832}
1833
1834bool ram_write_tracking_compatible(void)
1835{
1836 assert(0);
1837 return false;
1838}
1839
1840int ram_write_tracking_start(void)
1841{
1842 assert(0);
1843 return -1;
1844}
1845
1846void ram_write_tracking_stop(void)
1847{
1848 assert(0);
1849}
1850#endif /* defined(__linux__) */
1851
3d0684b2 1852/**
ff1543af 1853 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1854 *
1855 * Skips pages that are already sent (!dirty)
a82d593b 1856 *
a5f7b1a6 1857 * Returns true if a queued page is found
a82d593b 1858 *
6f37bb8b 1859 * @rs: current RAM state
3d0684b2 1860 * @pss: data about the state of the current dirty page scan
a82d593b 1861 */
f20e2865 1862static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1863{
1864 RAMBlock *block;
1865 ram_addr_t offset;
777f53c7
TH
1866 bool dirty;
1867
1868 do {
1869 block = unqueue_page(rs, &offset);
1870 /*
1871 * We're sending this page, and since it's postcopy nothing else
1872 * will dirty it, and we must make sure it doesn't get sent again
1873 * even if this queue request was received after the background
1874 * search already sent it.
1875 */
1876 if (block) {
1877 unsigned long page;
1878
1879 page = offset >> TARGET_PAGE_BITS;
1880 dirty = test_bit(page, block->bmap);
1881 if (!dirty) {
1882 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1883 page);
1884 } else {
1885 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1886 }
1887 }
a82d593b 1888
777f53c7 1889 } while (block && !dirty);
a82d593b 1890
b062106d 1891 if (!block) {
278e2f55
AG
1892 /*
1893 * Poll write faults too if background snapshot is enabled; that's
1894 * when we have vcpus got blocked by the write protected pages.
1895 */
1896 block = poll_fault_page(rs, &offset);
1897 }
1898
a82d593b 1899 if (block) {
a82d593b
DDAG
1900 /*
1901 * We want the background search to continue from the queued page
1902 * since the guest is likely to want other pages near to the page
1903 * it just requested.
1904 */
1905 pss->block = block;
a935e30f 1906 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1907
1908 /*
1909 * This unqueued page would break the "one round" check, even if it
1910 * is really rare.
1911 */
1912 pss->complete_round = false;
a82d593b
DDAG
1913 }
1914
1915 return !!block;
1916}
1917
6c595cde 1918/**
5e58f968
JQ
1919 * migration_page_queue_free: drop any remaining pages in the ram
1920 * request queue
6c595cde 1921 *
3d0684b2
JQ
1922 * It should be empty at the end anyway, but in error cases there may
1923 * be some left. If any pages are left, we drop them.
1924 *
6c595cde 1925 */
83c13382 1926static void migration_page_queue_free(RAMState *rs)
6c595cde 1927{
ec481c6c 1928 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1929 /* This queue should generally be empty - but in the case of a failed
1930 * migration it might have some leftover entries.
1931 */
89ac5a1d 1932 RCU_READ_LOCK_GUARD();
ec481c6c 1933 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1934 memory_region_unref(mspr->rb->mr);
ec481c6c 1935 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1936 g_free(mspr);
1937 }
6c595cde
DDAG
1938}
1939
1940/**
3d0684b2
JQ
1941 * ram_save_queue_pages: queue the page for transmission
1942 *
1943 * A request from postcopy destination for example.
1944 *
1945 * Returns zero on success or negative on error
1946 *
3d0684b2
JQ
1947 * @rbname: Name of the RAMBlock of the request. NULL means the
1948 * same as the last one.
1949 * @start: starting address from the start of the RAMBlock
1950 * @len: length (in bytes) to send
6c595cde 1951 */
96506894 1952int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1953{
1954 RAMBlock *ramblock;
53518d94 1955 RAMState *rs = ram_state;
6c595cde 1956
aff3f660 1957 stat64_add(&mig_stats.postcopy_requests, 1);
89ac5a1d
DDAG
1958 RCU_READ_LOCK_GUARD();
1959
6c595cde
DDAG
1960 if (!rbname) {
1961 /* Reuse last RAMBlock */
68a098f3 1962 ramblock = rs->last_req_rb;
6c595cde
DDAG
1963
1964 if (!ramblock) {
1965 /*
1966 * Shouldn't happen, we can't reuse the last RAMBlock if
1967 * it's the 1st request.
1968 */
1969 error_report("ram_save_queue_pages no previous block");
03acb4e9 1970 return -1;
6c595cde
DDAG
1971 }
1972 } else {
1973 ramblock = qemu_ram_block_by_name(rbname);
1974
1975 if (!ramblock) {
1976 /* We shouldn't be asked for a non-existent RAMBlock */
1977 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 1978 return -1;
6c595cde 1979 }
68a098f3 1980 rs->last_req_rb = ramblock;
6c595cde
DDAG
1981 }
1982 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 1983 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
1984 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1985 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 1986 __func__, start, len, ramblock->used_length);
03acb4e9 1987 return -1;
6c595cde
DDAG
1988 }
1989
93589827
PX
1990 /*
1991 * When with postcopy preempt, we send back the page directly in the
1992 * rp-return thread.
1993 */
1994 if (postcopy_preempt_active()) {
1995 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1996 size_t page_size = qemu_ram_pagesize(ramblock);
1997 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1998 int ret = 0;
1999
2000 qemu_mutex_lock(&rs->bitmap_mutex);
2001
2002 pss_init(pss, ramblock, page_start);
2003 /*
2004 * Always use the preempt channel, and make sure it's there. It's
2005 * safe to access without lock, because when rp-thread is running
2006 * we should be the only one who operates on the qemufile
2007 */
2008 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2009 assert(pss->pss_channel);
2010
2011 /*
2012 * It must be either one or a multiple of the host page size. Just
2013 * assert; if something is wrong we're mostly split-brain anyway.
2014 */
2015 assert(len % page_size == 0);
2016 while (len) {
2017 if (ram_save_host_page_urgent(pss)) {
2018 error_report("%s: ram_save_host_page_urgent() failed: "
2019 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2020 __func__, ramblock->idstr, start);
2021 ret = -1;
2022 break;
2023 }
2024 /*
2025 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2026 * will automatically be moved and point to the next host page
2027 * we're going to send, so no need to update here.
2028 *
2029 * Normally QEMU never sends >1 host page in requests, so
2030 * logically we don't even need that as the loop should only
2031 * run once, but just to be consistent.
2032 */
2033 len -= page_size;
2034 };
2035 qemu_mutex_unlock(&rs->bitmap_mutex);
2036
2037 return ret;
2038 }
2039
ec481c6c 2040 struct RAMSrcPageRequest *new_entry =
b21e2380 2041 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2042 new_entry->rb = ramblock;
2043 new_entry->offset = start;
2044 new_entry->len = len;
2045
2046 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2047 qemu_mutex_lock(&rs->src_page_req_mutex);
2048 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2049 migration_make_urgent_request();
ec481c6c 2050 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2051
2052 return 0;
6c595cde
DDAG
2053}
2054
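/*
 * Hedged usage sketch for the function above: a postcopy page request
 * arriving on the return path for a block named, say, "pc.ram" boils
 * down to a call such as
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, qemu_target_page_size());
 *
 * while passing rbname == NULL reuses the RAMBlock of the previous
 * request. The block name and offset here are made up for illustration.
 */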
d7400a34
XG
2055static bool save_page_use_compression(RAMState *rs)
2056{
a7a94d14 2057 if (!migrate_compress()) {
d7400a34
XG
2058 return false;
2059 }
2060
2061 /*
1a373522
DH
2062 * If xbzrle is enabled (e.g., after first round of migration), stop
2063 * using the data compression. In theory, xbzrle can do better than
2064 * compression.
d7400a34 2065 */
f3095cc8 2066 if (rs->xbzrle_started) {
1a373522 2067 return false;
d7400a34
XG
2068 }
2069
1a373522 2070 return true;
d7400a34
XG
2071}
2072
5e5fdcff
XG
2073/*
2074 * try to compress the page before posting it out, return true if the page
2075 * has been properly handled by compression, otherwise needs other
2076 * paths to handle it
2077 */
ec6f3ab9
PX
2078static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2079 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2080{
2081 if (!save_page_use_compression(rs)) {
2082 return false;
2083 }
2084
2085 /*
2086 * When starting a new block, the first page of the block should be
2087 * sent out before other pages in the same block, and all the pages
2088 * in the last block should have been sent out already. Keeping this
2089 * order is important, because the 'cont' flag is used to avoid
2090 * resending the block name.
2091 *
2092 * We post the first page as a normal page because compression takes
2093 * a lot of CPU.
2094 */
ec6f3ab9 2095 if (block != pss->last_sent_block) {
ef4f5f5d 2096 ram_flush_compressed_data(rs);
5e5fdcff
XG
2097 return false;
2098 }
2099
ef4f5f5d 2100 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
5e5fdcff
XG
2101 return true;
2102 }
2103
76e03000 2104 compression_counters.busy++;
5e5fdcff
XG
2105 return false;
2106}
2107
a82d593b 2108/**
4010ba38 2109 * ram_save_target_page_legacy: save one target page
a82d593b 2110 *
3d0684b2 2111 * Returns the number of pages written
a82d593b 2112 *
6f37bb8b 2113 * @rs: current RAM state
3d0684b2 2114 * @pss: data about the page we want to send
a82d593b 2115 */
4010ba38 2116static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2117{
a8ec91f9 2118 RAMBlock *block = pss->block;
8bba004c 2119 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2120 int res;
2121
61717ea9 2122 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2123 return res;
2124 }
2125
ec6f3ab9 2126 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2127 return 1;
d7400a34
XG
2128 }
2129
37502df3 2130 res = save_zero_page(pss, pss->pss_channel, block, offset);
d7400a34
XG
2131 if (res > 0) {
2132 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2133 * page would be stale
2134 */
f3095cc8 2135 if (rs->xbzrle_started) {
d7400a34
XG
2136 XBZRLE_cache_lock();
2137 xbzrle_cache_zero_page(rs, block->offset + offset);
2138 XBZRLE_cache_unlock();
2139 }
d7400a34
XG
2140 return res;
2141 }
2142
da3f56cb 2143 /*
6f39c90b
PX
2144 * Do not use multifd in postcopy as one whole host page should be
2145 * placed. Meanwhile postcopy requires atomic update of pages, so even
2146 * if host page size == guest page size, the destination guest may
2147 * still see partially copied pages at runtime, which is data corruption.
da3f56cb 2148 */
51b07548 2149 if (migrate_multifd() && !migration_in_postcopy()) {
61717ea9 2150 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2151 }
2152
05931ec5 2153 return ram_save_page(rs, pss);
a82d593b
DDAG
2154}
2155
d9e474ea
PX
2156/* Should be called before sending a host page */
2157static void pss_host_page_prepare(PageSearchStatus *pss)
2158{
2159 /* How many guest pages are there in one host page? */
2160 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2161
2162 pss->host_page_sending = true;
301d7ffe
PX
2163 if (guest_pfns <= 1) {
2164 /*
2165 * This covers both when guest psize == host psize, or when guest
2166 * has larger psize than the host (guest_pfns==0).
2167 *
2168 * For the latter, we always send one whole guest page per
2169 * iteration of the host page (example: an Alpha VM on x86 host
2170 * will have guest psize 8K while host psize 4K).
2171 */
2172 pss->host_page_start = pss->page;
2173 pss->host_page_end = pss->page + 1;
2174 } else {
2175 /*
2176 * The host page spans over multiple guest pages, we send them
2177 * within the same host page iteration.
2178 */
2179 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2180 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2181 }
d9e474ea
PX
2182}
2183
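/*
 * Worked example for pss_host_page_prepare(), assuming a 4K target page
 * size and a RAMBlock backed by 2M hugetlb pages: guest_pfns is 512, so
 * a dirty target page at index 1000 yields host_page_start = 512 and
 * host_page_end = 1024, i.e. the whole huge page is sent as one unit.
 */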
2184/*
2185 * Whether the page pointed by PSS is within the host page being sent.
2186 * Must be called after a previous pss_host_page_prepare().
2187 */
2188static bool pss_within_range(PageSearchStatus *pss)
2189{
2190 ram_addr_t ram_addr;
2191
2192 assert(pss->host_page_sending);
2193
2194 /* Over host-page boundary? */
2195 if (pss->page >= pss->host_page_end) {
2196 return false;
2197 }
2198
2199 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2200
2201 return offset_in_ramblock(pss->block, ram_addr);
2202}
2203
2204static void pss_host_page_finish(PageSearchStatus *pss)
2205{
2206 pss->host_page_sending = false;
2207 /* This is not needed, but just to reset it */
2208 pss->host_page_start = pss->host_page_end = 0;
2209}
2210
93589827
PX
2211/*
2212 * Send an urgent host page specified by `pss'. Need to be called with
2213 * bitmap_mutex held.
2214 *
2215 * Returns 0 if save host page succeeded, false otherwise.
2216 */
2217static int ram_save_host_page_urgent(PageSearchStatus *pss)
2218{
2219 bool page_dirty, sent = false;
2220 RAMState *rs = ram_state;
2221 int ret = 0;
2222
2223 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2224 pss_host_page_prepare(pss);
2225
2226 /*
2227 * If precopy is sending the same page, let it be done in precopy, or
2228 * we could send the same page in two channels and none of them will
2229 * receive the whole page.
2230 */
2231 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2232 trace_postcopy_preempt_hit(pss->block->idstr,
2233 pss->page << TARGET_PAGE_BITS);
2234 return 0;
2235 }
2236
2237 do {
2238 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2239
2240 if (page_dirty) {
2241 /* Be strict about the return code; it must be 1 here */
4010ba38 2242 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2243 error_report_once("%s: ram_save_target_page failed", __func__);
2244 ret = -1;
2245 goto out;
2246 }
2247 sent = true;
2248 }
2249 pss_find_next_dirty(pss);
2250 } while (pss_within_range(pss));
2251out:
2252 pss_host_page_finish(pss);
2253 /* For urgent requests, flush immediately if sent */
2254 if (sent) {
2255 qemu_fflush(pss->pss_channel);
2256 }
2257 return ret;
2258}
2259
a82d593b 2260/**
3d0684b2 2261 * ram_save_host_page: save a whole host page
a82d593b 2262 *
3d0684b2
JQ
2263 * Starting at *offset send pages up to the end of the current host
2264 * page. It's valid for the initial offset to point into the middle of
2265 * a host page, in which case the remainder of the host page is sent.
2266 * Only dirty target pages are sent. Note that the host page size may
2267 * be a huge page for this block.
f3321554 2268 *
1eb3fc0a
DDAG
2269 * The saving stops at the boundary of the used_length of the block
2270 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2271 *
f3321554
PX
2272 * The caller must hold ram_state.bitmap_mutex when calling this
2273 * function. Note that this function can temporarily release the lock, but
2274 * the lock is guaranteed to be held again by the time it returns.
2275 *
3d0684b2
JQ
2276 * Returns the number of pages written or negative on error
2277 *
6f37bb8b 2278 * @rs: current RAM state
3d0684b2 2279 * @pss: data about the page we want to send
a82d593b 2280 */
05931ec5 2281static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2282{
f3321554 2283 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2284 int tmppages, pages = 0;
a935e30f
JQ
2285 size_t pagesize_bits =
2286 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2287 unsigned long start_page = pss->page;
2288 int res;
4c011c37 2289
fbd162e6 2290 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2291 error_report("block %s should not be migrated !", pss->block->idstr);
2292 return 0;
2293 }
2294
d9e474ea
PX
2295 /* Update host page boundary information */
2296 pss_host_page_prepare(pss);
2297
a82d593b 2298 do {
f3321554 2299 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2300
f3321554
PX
2301 /* Check whether the page is dirty and, if so, send it */
2302 if (page_dirty) {
ba1b7c81 2303 /*
f3321554
PX
2304 * Properly yield the lock only in postcopy preempt mode
2305 * because both migration thread and rp-return thread can
2306 * operate on the bitmaps.
ba1b7c81 2307 */
f3321554
PX
2308 if (preempt_active) {
2309 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2310 }
4010ba38 2311 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2312 if (tmppages >= 0) {
2313 pages += tmppages;
2314 /*
2315 * Allow rate limiting to happen in the middle of huge pages if
2316 * something is sent in the current iteration.
2317 */
2318 if (pagesize_bits > 1 && tmppages > 0) {
2319 migration_rate_limit();
2320 }
2321 }
2322 if (preempt_active) {
2323 qemu_mutex_lock(&rs->bitmap_mutex);
2324 }
2325 } else {
2326 tmppages = 0;
23feba90 2327 }
f3321554
PX
2328
2329 if (tmppages < 0) {
d9e474ea 2330 pss_host_page_finish(pss);
f3321554
PX
2331 return tmppages;
2332 }
2333
d9e474ea
PX
2334 pss_find_next_dirty(pss);
2335 } while (pss_within_range(pss));
2336
2337 pss_host_page_finish(pss);
278e2f55
AG
2338
2339 res = ram_save_release_protection(rs, pss, start_page);
2340 return (res < 0 ? res : pages);
a82d593b 2341}
6c595cde 2342
56e93d26 2343/**
3d0684b2 2344 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2345 *
2346 * Called within an RCU critical section.
2347 *
e8f3735f
XG
2348 * Returns the number of pages written where zero means no dirty pages,
2349 * or negative on error
56e93d26 2350 *
6f37bb8b 2351 * @rs: current RAM state
a82d593b
DDAG
2352 *
2353 * On systems where host-page-size > target-page-size it will send all the
2354 * pages in a host page that are dirty.
56e93d26 2355 */
05931ec5 2356static int ram_find_and_save_block(RAMState *rs)
56e93d26 2357{
f1668764 2358 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2359 int pages = 0;
56e93d26 2360
0827b9e9 2361 /* No dirty page as there is zero RAM */
8d80e195 2362 if (!rs->ram_bytes_total) {
0827b9e9
AA
2363 return pages;
2364 }
2365
4934a5dd
PX
2366 /*
2367 * Always keep last_seen_block/last_page valid during this procedure,
2368 * because find_dirty_block() relies on these values (e.g., we compare
2369 * last_seen_block with pss.block to see whether we searched all the
2370 * ramblocks) to detect the completion of migration. A NULL
2371 * last_seen_block can cause the loop below to run forever.
2372 */
2373 if (!rs->last_seen_block) {
2374 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2375 rs->last_page = 0;
2376 }
2377
f1668764 2378 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2379
31e2ac74 2380 while (true) {
51efd36f 2381 if (!get_queued_page(rs, pss)) {
b062106d 2382 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2383 int res = find_dirty_block(rs, pss);
2384 if (res != PAGE_DIRTY_FOUND) {
2385 if (res == PAGE_ALL_CLEAN) {
51efd36f 2386 break;
31e2ac74
JQ
2387 } else if (res == PAGE_TRY_AGAIN) {
2388 continue;
294e5a40
JQ
2389 } else if (res < 0) {
2390 pages = res;
2391 break;
51efd36f
JQ
2392 }
2393 }
56e93d26 2394 }
51efd36f 2395 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2396 if (pages) {
2397 break;
2398 }
2399 }
56e93d26 2400
f1668764
PX
2401 rs->last_seen_block = pss->block;
2402 rs->last_page = pss->page;
56e93d26
JQ
2403
2404 return pages;
2405}
2406
8008a272 2407static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2408{
2409 RAMBlock *block;
2410 uint64_t total = 0;
2411
89ac5a1d
DDAG
2412 RCU_READ_LOCK_GUARD();
2413
8008a272
JQ
2414 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2415 total += block->used_length;
99e15582 2416 }
56e93d26
JQ
2417 return total;
2418}
2419
fbd162e6
YK
2420uint64_t ram_bytes_total(void)
2421{
8008a272
JQ
2422 RAMBlock *block;
2423 uint64_t total = 0;
2424
2425 RCU_READ_LOCK_GUARD();
2426
2427 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2428 total += block->used_length;
2429 }
2430 return total;
fbd162e6
YK
2431}
2432
f265e0e4 2433static void xbzrle_load_setup(void)
56e93d26 2434{
f265e0e4 2435 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2436}
2437
f265e0e4
JQ
2438static void xbzrle_load_cleanup(void)
2439{
2440 g_free(XBZRLE.decoded_buf);
2441 XBZRLE.decoded_buf = NULL;
2442}
2443
7d7c96be
PX
2444static void ram_state_cleanup(RAMState **rsp)
2445{
b9ccaf6d
DDAG
2446 if (*rsp) {
2447 migration_page_queue_free(*rsp);
2448 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2449 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2450 g_free(*rsp);
2451 *rsp = NULL;
2452 }
7d7c96be
PX
2453}
2454
84593a08
PX
2455static void xbzrle_cleanup(void)
2456{
2457 XBZRLE_cache_lock();
2458 if (XBZRLE.cache) {
2459 cache_fini(XBZRLE.cache);
2460 g_free(XBZRLE.encoded_buf);
2461 g_free(XBZRLE.current_buf);
2462 g_free(XBZRLE.zero_target_page);
2463 XBZRLE.cache = NULL;
2464 XBZRLE.encoded_buf = NULL;
2465 XBZRLE.current_buf = NULL;
2466 XBZRLE.zero_target_page = NULL;
2467 }
2468 XBZRLE_cache_unlock();
2469}
2470
f265e0e4 2471static void ram_save_cleanup(void *opaque)
56e93d26 2472{
53518d94 2473 RAMState **rsp = opaque;
6b6712ef 2474 RAMBlock *block;
eb859c53 2475
278e2f55
AG
2476 /* We don't use dirty log with background snapshots */
2477 if (!migrate_background_snapshot()) {
2478 /* the caller must hold the iothread lock or be in a bh, so there is
2479 * no writing race against the migration bitmap
2480 */
63b41db4
HH
2481 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2482 /*
2483 * do not stop dirty log without starting it, since
2484 * memory_global_dirty_log_stop will assert that
2485 * memory_global_dirty_log_start/stop are used in pairs
2486 */
2487 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2488 }
278e2f55 2489 }
6b6712ef 2490
fbd162e6 2491 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2492 g_free(block->clear_bmap);
2493 block->clear_bmap = NULL;
6b6712ef
JQ
2494 g_free(block->bmap);
2495 block->bmap = NULL;
56e93d26
JQ
2496 }
2497
84593a08 2498 xbzrle_cleanup();
f0afa331 2499 compress_threads_save_cleanup();
7d7c96be 2500 ram_state_cleanup(rsp);
4010ba38
JQ
2501 g_free(migration_ops);
2502 migration_ops = NULL;
56e93d26
JQ
2503}
2504
6f37bb8b 2505static void ram_state_reset(RAMState *rs)
56e93d26 2506{
ec6f3ab9
PX
2507 int i;
2508
2509 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2510 rs->pss[i].last_sent_block = NULL;
2511 }
2512
6f37bb8b 2513 rs->last_seen_block = NULL;
269ace29 2514 rs->last_page = 0;
6f37bb8b 2515 rs->last_version = ram_list.version;
f3095cc8 2516 rs->xbzrle_started = false;
56e93d26
JQ
2517}
2518
2519#define MAX_WAIT 50 /* ms, half buffered_file limit */
2520
e0b266f0
DDAG
2521/* **** functions for postcopy ***** */
2522
ced1c616
PB
2523void ram_postcopy_migrated_memory_release(MigrationState *ms)
2524{
2525 struct RAMBlock *block;
ced1c616 2526
fbd162e6 2527 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2528 unsigned long *bitmap = block->bmap;
2529 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2530 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2531
2532 while (run_start < range) {
2533 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2534 ram_discard_range(block->idstr,
2535 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2536 ((ram_addr_t)(run_end - run_start))
2537 << TARGET_PAGE_BITS);
ced1c616
PB
2538 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2539 }
2540 }
2541}
2542
3d0684b2
JQ
2543/**
2544 * postcopy_send_discard_bm_ram: discard a RAMBlock
2545 *
e0b266f0 2546 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2547 *
2548 * @ms: current migration state
89dab31b 2549 * @block: RAMBlock to discard
e0b266f0 2550 */
9e7d1223 2551static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2552{
6b6712ef 2553 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2554 unsigned long current;
1e7cf8c3 2555 unsigned long *bitmap = block->bmap;
e0b266f0 2556
6b6712ef 2557 for (current = 0; current < end; ) {
1e7cf8c3 2558 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2559 unsigned long zero, discard_length;
e0b266f0 2560
33a5cb62
WY
2561 if (one >= end) {
2562 break;
2563 }
e0b266f0 2564
1e7cf8c3 2565 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2566
2567 if (zero >= end) {
2568 discard_length = end - one;
e0b266f0 2569 } else {
33a5cb62
WY
2570 discard_length = zero - one;
2571 }
810cf2bb 2572 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2573 current = one + discard_length;
e0b266f0 2574 }
e0b266f0
DDAG
2575}
2576
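/*
 * Worked example for the loop above: with a (hypothetical) dirty bitmap
 * of 0,0,1,1,1,0,1,... the first iteration finds one = 2 and zero = 5,
 * so it sends the discard range (start = 2, length = 3) and resumes
 * scanning at page 5.
 */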
f30c2e5b
PX
2577static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2578
3d0684b2
JQ
2579/**
2580 * postcopy_each_ram_send_discard: discard all RAMBlocks
2581 *
e0b266f0
DDAG
2582 * Utility for the outgoing postcopy code.
2583 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2584 * passing it bitmap indexes and name.
e0b266f0
DDAG
2585 * (qemu_ram_foreach_block ends up passing unscaled lengths
2586 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2587 *
2588 * @ms: current migration state
e0b266f0 2589 */
739fcc1b 2590static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2591{
2592 struct RAMBlock *block;
e0b266f0 2593
fbd162e6 2594 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2595 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2596
f30c2e5b
PX
2597 /*
2598 * Deal with TPS != HPS and huge pages. It discards any partially sent
2599 * host-page size chunks and marks any partially dirty host-page size
2600 * chunks as all dirty. In this case the host-page is the host-page
2601 * for the particular RAMBlock, i.e. it might be a huge page.
2602 */
2603 postcopy_chunk_hostpages_pass(ms, block);
2604
e0b266f0
DDAG
2605 /*
2606 * Postcopy sends chunks of bitmap over the wire, but it
2607 * just needs indexes at this point, which avoids it having
2608 * target page specific code.
2609 */
739fcc1b 2610 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2611 postcopy_discard_send_finish(ms);
e0b266f0 2612 }
e0b266f0
DDAG
2613}
2614
3d0684b2 2615/**
8324ef86 2616 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2617 *
2618 * Helper for postcopy_chunk_hostpages; it's called twice to
2619 * canonicalize the two bitmaps, that are similar, but one is
2620 * inverted.
99e314eb 2621 *
3d0684b2
JQ
2622 * Postcopy requires that all target pages in a hostpage are dirty or
2623 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2624 *
3d0684b2 2625 * @ms: current migration state
3d0684b2 2626 * @block: block that contains the page we want to canonicalize
99e314eb 2627 */
1e7cf8c3 2628static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2629{
53518d94 2630 RAMState *rs = ram_state;
6b6712ef 2631 unsigned long *bitmap = block->bmap;
29c59172 2632 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2633 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2634 unsigned long run_start;
2635
29c59172
DDAG
2636 if (block->page_size == TARGET_PAGE_SIZE) {
2637 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2638 return;
2639 }
2640
1e7cf8c3
WY
2641 /* Find a dirty page */
2642 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2643
6b6712ef 2644 while (run_start < pages) {
99e314eb
DDAG
2645
2646 /*
2647 * If the start of this run of pages is in the middle of a host
2648 * page, then we need to fixup this host page.
2649 */
9dec3cc3 2650 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2651 /* Find the end of this run */
1e7cf8c3 2652 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2653 /*
2654 * If the end isn't at the start of a host page, then the
2655 * run doesn't finish at the end of a host page
2656 * and we need to discard.
2657 */
99e314eb
DDAG
2658 }
2659
9dec3cc3 2660 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2661 unsigned long page;
dad45ab2
WY
2662 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2663 host_ratio);
2664 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2665
99e314eb
DDAG
2666 /* Clean up the bitmap */
2667 for (page = fixup_start_addr;
2668 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2669 /*
2670 * Remark them as dirty, updating the count for any pages
2671 * that weren't previously dirty.
2672 */
0d8ec885 2673 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2674 }
2675 }
2676
1e7cf8c3
WY
2677 /* Find the next dirty page for the next iteration */
2678 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2679 }
2680}
2681
3d0684b2
JQ
2682/**
2683 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2684 *
e0b266f0
DDAG
2685 * Transmit the set of pages to be discarded after precopy to the target
2686 * these are pages that:
2687 * a) Have been previously transmitted but are now dirty again
2688 * b) Pages that have never been transmitted, this ensures that
2689 * any pages on the destination that have been mapped by background
2690 * tasks get discarded (transparent huge pages is the specific concern)
2691 * Hopefully this is pretty sparse
3d0684b2
JQ
2692 *
2693 * @ms: current migration state
e0b266f0 2694 */
739fcc1b 2695void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2696{
53518d94 2697 RAMState *rs = ram_state;
e0b266f0 2698
89ac5a1d 2699 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2700
2701 /* This should be our last sync, the src is now paused */
1e493be5 2702 migration_bitmap_sync(rs, false);
e0b266f0 2703
6b6712ef 2704 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2705 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2706 rs->last_seen_block = NULL;
6b6712ef 2707 rs->last_page = 0;
e0b266f0 2708
739fcc1b 2709 postcopy_each_ram_send_discard(ms);
e0b266f0 2710
739fcc1b 2711 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2712}
2713
3d0684b2
JQ
2714/**
2715 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2716 *
3d0684b2 2717 * Returns zero on success
e0b266f0 2718 *
36449157
JQ
2719 * @rbname: name of the RAMBlock of the request. NULL means the
2720 * same that last one.
3d0684b2
JQ
2721 * @start: RAMBlock starting page
2722 * @length: RAMBlock size
e0b266f0 2723 */
aaa2064c 2724int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2725{
36449157 2726 trace_ram_discard_range(rbname, start, length);
d3a5038c 2727
89ac5a1d 2728 RCU_READ_LOCK_GUARD();
36449157 2729 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2730
2731 if (!rb) {
36449157 2732 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2733 return -1;
e0b266f0
DDAG
2734 }
2735
814bb08f
PX
2736 /*
2737 * On source VM, we don't need to update the received bitmap since
2738 * we don't even have one.
2739 */
2740 if (rb->receivedmap) {
2741 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2742 length >> qemu_target_page_bits());
2743 }
2744
03acb4e9 2745 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2746}
2747
84593a08
PX
2748/*
2749 * For every allocation, we will try not to crash the VM if the
2750 * allocation failed.
2751 */
2752static int xbzrle_init(void)
2753{
2754 Error *local_err = NULL;
2755
87dca0c9 2756 if (!migrate_xbzrle()) {
84593a08
PX
2757 return 0;
2758 }
2759
2760 XBZRLE_cache_lock();
2761
2762 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2763 if (!XBZRLE.zero_target_page) {
2764 error_report("%s: Error allocating zero page", __func__);
2765 goto err_out;
2766 }
2767
2768 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2769 TARGET_PAGE_SIZE, &local_err);
2770 if (!XBZRLE.cache) {
2771 error_report_err(local_err);
2772 goto free_zero_page;
2773 }
2774
2775 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2776 if (!XBZRLE.encoded_buf) {
2777 error_report("%s: Error allocating encoded_buf", __func__);
2778 goto free_cache;
2779 }
2780
2781 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2782 if (!XBZRLE.current_buf) {
2783 error_report("%s: Error allocating current_buf", __func__);
2784 goto free_encoded_buf;
2785 }
2786
2787 /* We are all good */
2788 XBZRLE_cache_unlock();
2789 return 0;
2790
2791free_encoded_buf:
2792 g_free(XBZRLE.encoded_buf);
2793 XBZRLE.encoded_buf = NULL;
2794free_cache:
2795 cache_fini(XBZRLE.cache);
2796 XBZRLE.cache = NULL;
2797free_zero_page:
2798 g_free(XBZRLE.zero_target_page);
2799 XBZRLE.zero_target_page = NULL;
2800err_out:
2801 XBZRLE_cache_unlock();
2802 return -ENOMEM;
2803}
2804
53518d94 2805static int ram_state_init(RAMState **rsp)
56e93d26 2806{
7d00ee6a
PX
2807 *rsp = g_try_new0(RAMState, 1);
2808
2809 if (!*rsp) {
2810 error_report("%s: Init ramstate fail", __func__);
2811 return -1;
2812 }
53518d94
JQ
2813
2814 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2815 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2816 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 2817 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 2818
7d00ee6a 2819 /*
40c4d4a8
IR
2820 * Count the total number of pages used by ram blocks not including any
2821 * gaps due to alignment or unplugs.
03158519 2822 * This must match with the initial values of dirty bitmap.
7d00ee6a 2823 */
8d80e195 2824 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
2825 ram_state_reset(*rsp);
2826
2827 return 0;
2828}
2829
d6eff5d7 2830static void ram_list_init_bitmaps(void)
7d00ee6a 2831{
002cad6b 2832 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2833 RAMBlock *block;
2834 unsigned long pages;
002cad6b 2835 uint8_t shift;
56e93d26 2836
0827b9e9
AA
2837 /* Skip setting bitmap if there is no RAM */
2838 if (ram_bytes_total()) {
002cad6b
PX
2839 shift = ms->clear_bitmap_shift;
2840 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2841 error_report("clear_bitmap_shift (%u) too big, using "
2842 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2843 shift = CLEAR_BITMAP_SHIFT_MAX;
2844 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2845 error_report("clear_bitmap_shift (%u) too small, using "
2846 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2847 shift = CLEAR_BITMAP_SHIFT_MIN;
2848 }
2849
fbd162e6 2850 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2851 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2852 /*
2853 * The initial dirty bitmap for migration must be set with all
2854 * ones to make sure we'll migrate every guest RAM page to
2855 * destination.
40c4d4a8
IR
2856 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2857 * new migration after a failed migration, ram_list.
2858 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2859 * guest memory.
03158519 2860 */
6b6712ef 2861 block->bmap = bitmap_new(pages);
40c4d4a8 2862 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2863 block->clear_bmap_shift = shift;
2864 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2865 }
f3f491fc 2866 }
d6eff5d7
PX
2867}
2868
be39b4cd
DH
2869static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2870{
2871 unsigned long pages;
2872 RAMBlock *rb;
2873
2874 RCU_READ_LOCK_GUARD();
2875
2876 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2877 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2878 rs->migration_dirty_pages -= pages;
2879 }
2880}
2881
d6eff5d7
PX
2882static void ram_init_bitmaps(RAMState *rs)
2883{
2884 /* For memory_global_dirty_log_start below. */
2885 qemu_mutex_lock_iothread();
2886 qemu_mutex_lock_ramlist();
f3f491fc 2887
89ac5a1d
DDAG
2888 WITH_RCU_READ_LOCK_GUARD() {
2889 ram_list_init_bitmaps();
278e2f55
AG
2890 /* We don't use dirty log with background snapshots */
2891 if (!migrate_background_snapshot()) {
63b41db4 2892 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
1e493be5 2893 migration_bitmap_sync_precopy(rs, false);
278e2f55 2894 }
89ac5a1d 2895 }
56e93d26 2896 qemu_mutex_unlock_ramlist();
49877834 2897 qemu_mutex_unlock_iothread();
be39b4cd
DH
2898
2899 /*
2900 * After an eventual first bitmap sync, fixup the initial bitmap
2901 * containing all 1s to exclude any discarded pages from migration.
2902 */
2903 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
2904}
2905
2906static int ram_init_all(RAMState **rsp)
2907{
2908 if (ram_state_init(rsp)) {
2909 return -1;
2910 }
2911
2912 if (xbzrle_init()) {
2913 ram_state_cleanup(rsp);
2914 return -1;
2915 }
2916
2917 ram_init_bitmaps(*rsp);
a91246c9
HZ
2918
2919 return 0;
2920}
2921
08614f34
PX
2922static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2923{
2924 RAMBlock *block;
2925 uint64_t pages = 0;
2926
2927 /*
2928 * Postcopy is not using xbzrle/compression, so no need for that.
2929 * Also, since the source is already halted, we don't need to care
2930 * about dirty page logging either.
2931 */
2932
fbd162e6 2933 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2934 pages += bitmap_count_one(block->bmap,
2935 block->used_length >> TARGET_PAGE_BITS);
2936 }
2937
2938 /* This may not be aligned with current bitmaps. Recalculate. */
2939 rs->migration_dirty_pages = pages;
2940
1a373522 2941 ram_state_reset(rs);
08614f34
PX
2942
2943 /* Update RAMState cache of output QEMUFile */
7f401b80 2944 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
2945
2946 trace_ram_state_resume_prepare(pages);
2947}
2948
6bcb05fc
WW
2949/*
2950 * This function clears bits of the free pages reported by the caller from the
2951 * migration dirty bitmap. @addr is the host address corresponding to the
2952 * start of the continuous guest free pages, and @len is the total bytes of
2953 * those pages.
2954 */
2955void qemu_guest_free_page_hint(void *addr, size_t len)
2956{
2957 RAMBlock *block;
2958 ram_addr_t offset;
2959 size_t used_len, start, npages;
2960 MigrationState *s = migrate_get_current();
2961
2962 /* This function is currently expected to be used during live migration */
2963 if (!migration_is_setup_or_active(s->state)) {
2964 return;
2965 }
2966
2967 for (; len > 0; len -= used_len, addr += used_len) {
2968 block = qemu_ram_block_from_host(addr, false, &offset);
2969 if (unlikely(!block || offset >= block->used_length)) {
2970 /*
2971 * The implementation might not support RAMBlock resize during
2972 * live migration, but it could happen in theory with future
2973 * updates. So we add a check here to capture that case.
2974 */
2975 error_report_once("%s unexpected error", __func__);
2976 return;
2977 }
2978
2979 if (len <= block->used_length - offset) {
2980 used_len = len;
2981 } else {
2982 used_len = block->used_length - offset;
2983 }
2984
2985 start = offset >> TARGET_PAGE_BITS;
2986 npages = used_len >> TARGET_PAGE_BITS;
2987
2988 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
2989 /*
2990 * The skipped free pages are equivalent to having been sent from clear_bmap's
2991 * perspective, so clear the bits from the memory region bitmap which
2992 * are initially set. Otherwise those skipped pages will be sent in
2993 * the next round after syncing from the memory region bitmap.
2994 */
1230a25f 2995 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
2996 ram_state->migration_dirty_pages -=
2997 bitmap_count_one_with_offset(block->bmap, start, npages);
2998 bitmap_clear(block->bmap, start, npages);
2999 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3000 }
3001}
3002
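/*
 * Worked example for the loop above, assuming 4K target pages: a 2M
 * hint that lies entirely inside one RAMBlock clears 512 bits of
 * block->bmap starting at (offset >> TARGET_PAGE_BITS) and decrements
 * migration_dirty_pages by however many of those bits were still set.
 */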
3d0684b2
JQ
3003/*
3004 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3005 * a long-running RCU critical section. When RCU reclaims in the code
3006 * start to become numerous it will be necessary to reduce the
3007 * granularity of these critical sections.
3008 */
3009
3d0684b2
JQ
3010/**
3011 * ram_save_setup: Setup RAM for migration
3012 *
3013 * Returns zero to indicate success and negative for error
3014 *
3015 * @f: QEMUFile where to send the data
3016 * @opaque: RAMState pointer
3017 */
a91246c9
HZ
3018static int ram_save_setup(QEMUFile *f, void *opaque)
3019{
53518d94 3020 RAMState **rsp = opaque;
a91246c9 3021 RAMBlock *block;
33d70973 3022 int ret;
a91246c9 3023
dcaf446e
XG
3024 if (compress_threads_save_setup()) {
3025 return -1;
3026 }
3027
a91246c9
HZ
3028 /* migration has already setup the bitmap, reuse it. */
3029 if (!migration_in_colo_state()) {
7d00ee6a 3030 if (ram_init_all(rsp) != 0) {
dcaf446e 3031 compress_threads_save_cleanup();
a91246c9 3032 return -1;
53518d94 3033 }
a91246c9 3034 }
7f401b80 3035 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3036
0e6ebd48 3037 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3038 qemu_put_be64(f, ram_bytes_total_with_ignored()
3039 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3040
0e6ebd48
DDAG
3041 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3042 qemu_put_byte(f, strlen(block->idstr));
3043 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3044 qemu_put_be64(f, block->used_length);
3045 if (migrate_postcopy_ram() && block->page_size !=
3046 qemu_host_page_size) {
3047 qemu_put_be64(f, block->page_size);
3048 }
3049 if (migrate_ignore_shared()) {
3050 qemu_put_be64(f, block->mr->addr);
3051 }
fbd162e6 3052 }
56e93d26
JQ
3053 }
3054
56e93d26
JQ
3055 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3056 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3057
4010ba38
JQ
3058 migration_ops = g_malloc0(sizeof(MigrationOps));
3059 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
8ebb6ecc 3060 ret = multifd_send_sync_main(f);
33d70973
LB
3061 if (ret < 0) {
3062 return ret;
3063 }
3064
294e5a40
JQ
3065 if (!migrate_multifd_flush_after_each_section()) {
3066 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3067 }
3068
56e93d26 3069 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3070 qemu_fflush(f);
56e93d26
JQ
3071
3072 return 0;
3073}
3074
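/*
 * Rough sketch of the setup-stage stream layout emitted above (widths
 * follow the qemu_put_* calls; optional fields depend on the negotiated
 * capabilities):
 *
 *   be64   ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE
 *   per migratable RAMBlock:
 *     u8     strlen(idstr)
 *     bytes  idstr (not NUL-terminated)
 *     be64   used_length
 *     be64   page_size   (only with postcopy-ram and non-host page size)
 *     be64   mr->addr    (only with ignore-shared)
 *   be64   RAM_SAVE_FLAG_MULTIFD_FLUSH (unless flushing each section)
 *   be64   RAM_SAVE_FLAG_EOS
 */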
3d0684b2
JQ
3075/**
3076 * ram_save_iterate: iterative stage for migration
3077 *
3078 * Returns zero to indicate success and negative for error
3079 *
3080 * @f: QEMUFile where to send the data
3081 * @opaque: RAMState pointer
3082 */
56e93d26
JQ
3083static int ram_save_iterate(QEMUFile *f, void *opaque)
3084{
53518d94
JQ
3085 RAMState **temp = opaque;
3086 RAMState *rs = *temp;
3d4095b2 3087 int ret = 0;
56e93d26
JQ
3088 int i;
3089 int64_t t0;
5c90308f 3090 int done = 0;
56e93d26 3091
b2557345
PL
3092 if (blk_mig_bulk_active()) {
3093 /* Avoid transferring ram during bulk phase of block migration as
3094 * the bulk phase will usually take a long time and transferring
3095 * ram updates during that time is pointless. */
3096 goto out;
3097 }
3098
63268c49
PX
3099 /*
3100 * We'll take this lock a little bit long, but it's okay for two reasons.
3101 * Firstly, the only other thread that can take it is the one calling
3102 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3103 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3104 * guarantees that we release it on a regular basis.
3105 */
3106 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3107 WITH_RCU_READ_LOCK_GUARD() {
3108 if (ram_list.version != rs->last_version) {
3109 ram_state_reset(rs);
3110 }
56e93d26 3111
89ac5a1d
DDAG
3112 /* Read version before ram_list.blocks */
3113 smp_rmb();
56e93d26 3114
89ac5a1d 3115 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3116
89ac5a1d
DDAG
3117 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3118 i = 0;
3119 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3120 postcopy_has_request(rs)) {
89ac5a1d 3121 int pages;
e03a34f8 3122
89ac5a1d
DDAG
3123 if (qemu_file_get_error(f)) {
3124 break;
3125 }
e8f3735f 3126
05931ec5 3127 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3128 /* no more pages to send */
3129 if (pages == 0) {
3130 done = 1;
3131 break;
3132 }
e8f3735f 3133
89ac5a1d
DDAG
3134 if (pages < 0) {
3135 qemu_file_set_error(f, pages);
56e93d26
JQ
3136 break;
3137 }
89ac5a1d
DDAG
3138
3139 rs->target_page_count += pages;
3140
644acf99
WY
3141 /*
3142 * During postcopy, it is necessary to make sure one whole host
3143 * page is sent in one chunk.
3144 */
3145 if (migrate_postcopy_ram()) {
ef4f5f5d 3146 ram_flush_compressed_data(rs);
644acf99
WY
3147 }
3148
89ac5a1d
DDAG
3149 /*
3150 * We want to check in the 1st loop iteration, just in case it was the
3151 * 1st time and we had to sync the dirty bitmap.
3152 * qemu_clock_get_ns() is a bit expensive, so we only check every
3153 * few iterations
3154 */
3155 if ((i & 63) == 0) {
3156 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3157 1000000;
3158 if (t1 > MAX_WAIT) {
3159 trace_ram_save_iterate_big_wait(t1, i);
3160 break;
3161 }
3162 }
3163 i++;
56e93d26 3164 }
56e93d26 3165 }
63268c49 3166 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3167
3168 /*
3169 * Must occur before EOS (or any QEMUFile operation)
3170 * because of RDMA protocol.
3171 */
3172 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3173
b2557345 3174out:
b69a0227
JQ
3175 if (ret >= 0
3176 && migration_is_setup_or_active(migrate_get_current()->state)) {
b05292c2
JQ
3177 if (migrate_multifd_flush_after_each_section()) {
3178 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3179 if (ret < 0) {
3180 return ret;
3181 }
33d70973
LB
3182 }
3183
3d4095b2
JQ
3184 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3185 qemu_fflush(f);
4c2d0f6d 3186 ram_transferred_add(8);
56e93d26 3187
3d4095b2
JQ
3188 ret = qemu_file_get_error(f);
3189 }
56e93d26
JQ
3190 if (ret < 0) {
3191 return ret;
3192 }
3193
5c90308f 3194 return done;
56e93d26
JQ
3195}
3196
3d0684b2
JQ
3197/**
3198 * ram_save_complete: function called to send the remaining amount of ram
3199 *
e8f3735f 3200 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3201 *
3202 * Called with iothread lock
3203 *
3204 * @f: QEMUFile where to send the data
3205 * @opaque: RAMState pointer
3206 */
56e93d26
JQ
3207static int ram_save_complete(QEMUFile *f, void *opaque)
3208{
53518d94
JQ
3209 RAMState **temp = opaque;
3210 RAMState *rs = *temp;
e8f3735f 3211 int ret = 0;
6f37bb8b 3212
05931ec5
JQ
3213 rs->last_stage = !migration_in_colo_state();
3214
89ac5a1d
DDAG
3215 WITH_RCU_READ_LOCK_GUARD() {
3216 if (!migration_in_postcopy()) {
1e493be5 3217 migration_bitmap_sync_precopy(rs, true);
89ac5a1d 3218 }
56e93d26 3219
89ac5a1d 3220 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3221
89ac5a1d 3222 /* try transferring iterative blocks of memory */
56e93d26 3223
89ac5a1d 3224 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3225 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3226 while (true) {
3227 int pages;
56e93d26 3228
05931ec5 3229 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3230 /* no more blocks to send */
3231 if (pages == 0) {
3232 break;
3233 }
3234 if (pages < 0) {
3235 ret = pages;
3236 break;
3237 }
e8f3735f 3238 }
c13221b5 3239 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3240
ef4f5f5d 3241 ram_flush_compressed_data(rs);
89ac5a1d
DDAG
3242 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3243 }
d09a6fde 3244
33d70973
LB
3245 if (ret < 0) {
3246 return ret;
3d4095b2 3247 }
56e93d26 3248
7f401b80 3249 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3250 if (ret < 0) {
3251 return ret;
3252 }
3253
294e5a40
JQ
3254 if (!migrate_multifd_flush_after_each_section()) {
3255 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3256 }
33d70973
LB
3257 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3258 qemu_fflush(f);
3259
3260 return 0;
56e93d26
JQ
3261}
3262
24beea4e
JQ
3263static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3264 uint64_t *can_postcopy)
56e93d26 3265{
53518d94
JQ
3266 RAMState **temp = opaque;
3267 RAMState *rs = *temp;
56e93d26 3268
c8df4a7a 3269 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3270
c8df4a7a
JQ
3271 if (migrate_postcopy_ram()) {
3272 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3273 *can_postcopy += remaining_size;
c8df4a7a 3274 } else {
24beea4e 3275 *must_precopy += remaining_size;
c8df4a7a
JQ
3276 }
3277}
3278
24beea4e
JQ
3279static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3280 uint64_t *can_postcopy)
c8df4a7a 3281{
28ef5339 3282 MigrationState *s = migrate_get_current();
c8df4a7a
JQ
3283 RAMState **temp = opaque;
3284 RAMState *rs = *temp;
3285
3286 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3287
28ef5339 3288 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
56e93d26 3289 qemu_mutex_lock_iothread();
89ac5a1d 3290 WITH_RCU_READ_LOCK_GUARD() {
1e493be5 3291 migration_bitmap_sync_precopy(rs, false);
89ac5a1d 3292 }
56e93d26 3293 qemu_mutex_unlock_iothread();
9edabd4d 3294 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3295 }
c31b098f 3296
86e1167e
VSO
3297 if (migrate_postcopy_ram()) {
3298 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3299 *can_postcopy += remaining_size;
86e1167e 3300 } else {
24beea4e 3301 *must_precopy += remaining_size;
86e1167e 3302 }
56e93d26
JQ
3303}
3304
3305static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3306{
3307 unsigned int xh_len;
3308 int xh_flags;
063e760a 3309 uint8_t *loaded_data;
56e93d26 3310
56e93d26
JQ
3311 /* extract RLE header */
3312 xh_flags = qemu_get_byte(f);
3313 xh_len = qemu_get_be16(f);
3314
3315 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3316 error_report("Failed to load XBZRLE page - wrong compression!");
3317 return -1;
3318 }
3319
3320 if (xh_len > TARGET_PAGE_SIZE) {
3321 error_report("Failed to load XBZRLE page - len overflow!");
3322 return -1;
3323 }
f265e0e4 3324 loaded_data = XBZRLE.decoded_buf;
56e93d26 3325 /* load data and decode */
f265e0e4 3326 /* it can change loaded_data to point to an internal buffer */
063e760a 3327 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3328
3329 /* decode RLE */
063e760a 3330 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3331 TARGET_PAGE_SIZE) == -1) {
3332 error_report("Failed to load XBZRLE page - decode error!");
3333 return -1;
3334 }
3335
3336 return 0;
3337}
3338
3d0684b2
JQ
3339/**
3340 * ram_block_from_stream: read a RAMBlock id from the migration stream
3341 *
3342 * Must be called from within an RCU critical section.
3343 *
56e93d26 3344 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3345 *
755e8d7c 3346 * @mis: the migration incoming state pointer
3d0684b2
JQ
3347 * @f: QEMUFile where to read the data from
3348 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3349 * @channel: the channel we're using
a7180877 3350 */
755e8d7c 3351static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3352 QEMUFile *f, int flags,
3353 int channel)
56e93d26 3354{
c01b16ed 3355 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3356 char id[256];
3357 uint8_t len;
3358
3359 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3360 if (!block) {
56e93d26
JQ
3361 error_report("Ack, bad migration stream!");
3362 return NULL;
3363 }
4c4bad48 3364 return block;
56e93d26
JQ
3365 }
3366
3367 len = qemu_get_byte(f);
3368 qemu_get_buffer(f, (uint8_t *)id, len);
3369 id[len] = 0;
3370
e3dd7493 3371 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3372 if (!block) {
3373 error_report("Can't find block %s", id);
3374 return NULL;
56e93d26
JQ
3375 }
3376
fbd162e6 3377 if (ramblock_is_ignored(block)) {
b895de50
CLG
3378 error_report("block %s should not be migrated!", id);
3379 return NULL;
3380 }
3381
c01b16ed 3382 mis->last_recv_block[channel] = block;
755e8d7c 3383
4c4bad48
HZ
3384 return block;
3385}
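/*
 * From the reads above, the block header on the wire is either nothing at
 * all (RAM_SAVE_FLAG_CONTINUE set in the page flags, meaning "same block as
 * the last one received on this channel") or a one-byte id length followed
 * by the id string, which then becomes mis->last_recv_block[channel].
 */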
3386
3387static inline void *host_from_ram_block_offset(RAMBlock *block,
3388 ram_addr_t offset)
3389{
3390 if (!offset_in_ramblock(block, offset)) {
3391 return NULL;
3392 }
3393
3394 return block->host + offset;
56e93d26
JQ
3395}
3396
6a23f639
DH
3397static void *host_page_from_ram_block_offset(RAMBlock *block,
3398 ram_addr_t offset)
3399{
3400 /* Note: Explicitly no check against offset_in_ramblock(). */
3401 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3402 block->page_size);
3403}
3404
3405static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3406 ram_addr_t offset)
3407{
3408 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3409}
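/*
 * Illustration (assuming block->host is aligned to block->page_size, e.g.
 * a 2 MiB hugetlbfs block): for offset 0x201000,
 * host_page_from_ram_block_offset() returns block->host + 0x200000 and
 * host_page_offset_from_ram_block_offset() returns 0x1000.
 */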
3410
871cfc54
LS
3411void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3412{
3413 qemu_mutex_lock(&ram_state->bitmap_mutex);
3414 for (int i = 0; i < pages; i++) {
3415 ram_addr_t offset = normal[i];
3416 ram_state->migration_dirty_pages += !test_and_set_bit(
3417 offset >> TARGET_PAGE_BITS,
3418 block->bmap);
3419 }
3420 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3421}
3422
13af18f2 3423static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3424 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3425{
3426 if (!offset_in_ramblock(block, offset)) {
3427 return NULL;
3428 }
3429 if (!block->colo_cache) {
3430 error_report("%s: colo_cache is NULL in block :%s",
3431 __func__, block->idstr);
3432 return NULL;
3433 }
7d9acafa
ZC
3434
3435 /*
3436 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3437 * It helps us decide which pages in the RAM cache should be flushed
3438 * into the VM's RAM later.
3439 */
871cfc54
LS
3440 if (record_bitmap) {
3441 colo_record_bitmap(block, &offset, 1);
7d9acafa 3442 }
13af18f2
ZC
3443 return block->colo_cache + offset;
3444}
3445
3d0684b2
JQ
3446/**
3447 * ram_handle_compressed: handle the zero page case
3448 *
56e93d26
JQ
3449 * If a page (or a whole RDMA chunk) has been
3450 * determined to be zero, then zap it.
3d0684b2
JQ
3451 *
3452 * @host: host address for the zero page
3453 * @ch: what the page is filled from. We only support zero
3454 * @size: size of the zero page
56e93d26
JQ
3455 */
3456void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3457{
bad452a7 3458 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3459 memset(host, ch, size);
3460 }
3461}
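/*
 * The buffer_is_zero() check above means an already-zero page is not
 * rewritten with zeroes, presumably to avoid needlessly dirtying (or, for
 * anonymous memory, allocating) pages that are zero to begin with.
 */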
3462
b70cb3b4
RL
3463static void colo_init_ram_state(void)
3464{
3465 ram_state_init(&ram_state);
b70cb3b4
RL
3466}
3467
13af18f2
ZC
3468/*
3469 * colo cache: this is for the secondary VM. We cache the whole
3470 * memory of the secondary VM; the global lock must be held when
3471 * calling this helper.
3472 */
3473int colo_init_ram_cache(void)
3474{
3475 RAMBlock *block;
3476
44901b5a
PB
3477 WITH_RCU_READ_LOCK_GUARD() {
3478 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3479 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3480 NULL, false, false);
44901b5a
PB
3481 if (!block->colo_cache) {
3482 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3483 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3484 block->used_length);
3485 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3486 if (block->colo_cache) {
3487 qemu_anon_ram_free(block->colo_cache, block->used_length);
3488 block->colo_cache = NULL;
3489 }
89ac5a1d 3490 }
44901b5a 3491 return -errno;
89ac5a1d 3492 }
e5fdf920
LS
3493 if (!machine_dump_guest_core(current_machine)) {
3494 qemu_madvise(block->colo_cache, block->used_length,
3495 QEMU_MADV_DONTDUMP);
3496 }
13af18f2 3497 }
13af18f2 3498 }
44901b5a 3499
7d9acafa
ZC
3500 /*
3501 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3502 * decide which pages in the cache should be flushed into the SVM's RAM. Here
3503 * we use the same name 'ram_bitmap' as for migration.
3504 */
3505 if (ram_bytes_total()) {
3506 RAMBlock *block;
3507
fbd162e6 3508 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3509 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3510 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3511 }
3512 }
7d9acafa 3513
b70cb3b4 3514 colo_init_ram_state();
13af18f2 3515 return 0;
13af18f2
ZC
3516}
3517
0393031a
HZ
3518/* TODO: duplicated with ram_init_bitmaps */
3519void colo_incoming_start_dirty_log(void)
3520{
3521 RAMBlock *block = NULL;
3522 /* For memory_global_dirty_log_start below. */
3523 qemu_mutex_lock_iothread();
3524 qemu_mutex_lock_ramlist();
3525
1e493be5 3526 memory_global_dirty_log_sync(false);
0393031a
HZ
3527 WITH_RCU_READ_LOCK_GUARD() {
3528 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3529 ramblock_sync_dirty_bitmap(ram_state, block);
3530 /* Discard this dirty bitmap record */
3531 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3532 }
63b41db4 3533 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3534 }
3535 ram_state->migration_dirty_pages = 0;
3536 qemu_mutex_unlock_ramlist();
3537 qemu_mutex_unlock_iothread();
3538}
3539
13af18f2
ZC
3540/* The global lock must be held to call this helper */
3541void colo_release_ram_cache(void)
3542{
3543 RAMBlock *block;
3544
63b41db4 3545 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3546 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3547 g_free(block->bmap);
3548 block->bmap = NULL;
3549 }
3550
89ac5a1d
DDAG
3551 WITH_RCU_READ_LOCK_GUARD() {
3552 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3553 if (block->colo_cache) {
3554 qemu_anon_ram_free(block->colo_cache, block->used_length);
3555 block->colo_cache = NULL;
3556 }
13af18f2
ZC
3557 }
3558 }
0393031a 3559 ram_state_cleanup(&ram_state);
13af18f2
ZC
3560}
3561
f265e0e4
JQ
3562/**
3563 * ram_load_setup: Setup RAM for migration incoming side
3564 *
3565 * Returns zero to indicate success and negative for error
3566 *
3567 * @f: QEMUFile where to receive the data
3568 * @opaque: RAMState pointer
3569 */
3570static int ram_load_setup(QEMUFile *f, void *opaque)
3571{
3572 xbzrle_load_setup();
f9494614 3573 ramblock_recv_map_init();
13af18f2 3574
f265e0e4
JQ
3575 return 0;
3576}
3577
3578static int ram_load_cleanup(void *opaque)
3579{
f9494614 3580 RAMBlock *rb;
56eb90af 3581
fbd162e6 3582 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3583 qemu_ram_block_writeback(rb);
56eb90af
JH
3584 }
3585
f265e0e4 3586 xbzrle_load_cleanup();
f9494614 3587
fbd162e6 3588 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3589 g_free(rb->receivedmap);
3590 rb->receivedmap = NULL;
3591 }
13af18f2 3592
f265e0e4
JQ
3593 return 0;
3594}
3595
3d0684b2
JQ
3596/**
3597 * ram_postcopy_incoming_init: allocate postcopy data structures
3598 *
3599 * Returns 0 for success and negative if there was one error
3600 *
3601 * @mis: current migration incoming state
3602 *
3603 * Allocate data structures etc. needed by incoming migration with
3604 * postcopy-ram. postcopy-ram's similarly named
3605 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3606 */
3607int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3608{
c136180c 3609 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3610}
3611
3d0684b2
JQ
3612/**
3613 * ram_load_postcopy: load a page in postcopy case
3614 *
3615 * Returns 0 for success or -errno in case of error
3616 *
a7180877
DDAG
3617 * Called in postcopy mode by ram_load().
3618 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3619 *
3620 * @f: QEMUFile to receive the data from
36f62f11 3621 * @channel: the channel to use for loading
a7180877 3622 */
36f62f11 3623int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
3624{
3625 int flags = 0, ret = 0;
3626 bool place_needed = false;
1aa83678 3627 bool matches_target_page_size = false;
a7180877 3628 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 3629 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
3630
3631 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3632 ram_addr_t addr;
a7180877
DDAG
3633 void *page_buffer = NULL;
3634 void *place_source = NULL;
df9ff5e1 3635 RAMBlock *block = NULL;
a7180877 3636 uint8_t ch;
644acf99 3637 int len;
a7180877
DDAG
3638
3639 addr = qemu_get_be64(f);
7a9ddfbf
PX
3640
3641 /*
3642 * If qemu file error, we should stop here, and then "addr"
3643 * may be invalid
3644 */
3645 ret = qemu_file_get_error(f);
3646 if (ret) {
3647 break;
3648 }
3649
a7180877
DDAG
3650 flags = addr & ~TARGET_PAGE_MASK;
3651 addr &= TARGET_PAGE_MASK;
3652
36f62f11 3653 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
3654 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3655 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 3656 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
3657 if (!block) {
3658 ret = -EINVAL;
3659 break;
3660 }
4c4bad48 3661
898ba906
DH
3662 /*
3663 * Relying on used_length is racy and can result in false positives.
3664 * We might place pages beyond used_length in case RAM was shrunk
3665 * while in postcopy, which is fine - trying to place via
3666 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3667 */
3668 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
3669 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3670 ret = -EINVAL;
3671 break;
3672 }
77dadc3f 3673 tmp_page->target_pages++;
1aa83678 3674 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3675 /*
28abd200
DDAG
3676 * Postcopy requires that we place whole host pages atomically;
3677 * these may be huge pages for RAMBlocks that are backed by
3678 * hugetlbfs.
a7180877
DDAG
3679 * To make it atomic, the data is read into a temporary page
3680 * that's moved into place later.
3681 * The migration protocol uses (possibly smaller) target pages;
3682 * however, the source ensures it always sends all the components
91ba442f 3683 * of a host page in one chunk.
a7180877 3684 */
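            /*
             * For example, with 2 MiB hugetlbfs host pages and 4 KiB target
             * pages, 512 target pages are accumulated in tmp_huge_page before
             * the whole host page is placed in one go (see the
             * block->page_size / TARGET_PAGE_SIZE check below).
             */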
77dadc3f 3685 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
3686 host_page_offset_from_ram_block_offset(block, addr);
3687 /* If all target pages are zero then we can optimise the placement */
77dadc3f
PX
3688 if (tmp_page->target_pages == 1) {
3689 tmp_page->host_addr =
3690 host_page_from_ram_block_offset(block, addr);
3691 } else if (tmp_page->host_addr !=
3692 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 3693 /* not the 1st TP within the HP */
36f62f11 3694 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
3695 "Target host page %p, received host page %p "
3696 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 3697 channel, tmp_page->host_addr,
cfc7dc8a
PX
3698 host_page_from_ram_block_offset(block, addr),
3699 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
3700 ret = -EINVAL;
3701 break;
a7180877
DDAG
3702 }
3703
3704 /*
3705 * If it's the last part of a host page then we place the host
3706 * page
3707 */
77dadc3f
PX
3708 if (tmp_page->target_pages ==
3709 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 3710 place_needed = true;
4cbb3c63 3711 }
77dadc3f 3712 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
3713 }
3714
3715 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3716 case RAM_SAVE_FLAG_ZERO:
a7180877 3717 ch = qemu_get_byte(f);
2e36bc1b
WY
3718 /*
3719 * We can skip setting page_buffer when this is a zero
3720 * page and (block->page_size == TARGET_PAGE_SIZE).
3721 */
3722 if (ch || !matches_target_page_size) {
3723 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3724 }
a7180877 3725 if (ch) {
77dadc3f 3726 tmp_page->all_zero = false;
a7180877
DDAG
3727 }
3728 break;
3729
3730 case RAM_SAVE_FLAG_PAGE:
77dadc3f 3731 tmp_page->all_zero = false;
1aa83678
PX
3732 if (!matches_target_page_size) {
3733 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3734 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3735 } else {
1aa83678
PX
3736 /*
3737 * For small pages that match the target page size, we
3738 * avoid the qemu_file copy. Instead we directly use
3739 * the buffer of QEMUFile to place the page. Note: we
3740 * must not do any further QEMUFile operation before that
3741 * buffer has been used, to make sure it is still valid
3742 * when placing the page.
a7180877
DDAG
3743 */
3744 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3745 TARGET_PAGE_SIZE);
3746 }
3747 break;
644acf99 3748 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 3749 tmp_page->all_zero = false;
644acf99
WY
3750 len = qemu_get_be32(f);
3751 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3752 error_report("Invalid compressed data length: %d", len);
3753 ret = -EINVAL;
3754 break;
3755 }
3756 decompress_data_with_multi_threads(f, page_buffer, len);
3757 break;
294e5a40
JQ
3758 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3759 multifd_recv_sync_main();
3760 break;
a7180877
DDAG
3761 case RAM_SAVE_FLAG_EOS:
3762 /* normal exit */
b05292c2
JQ
3763 if (migrate_multifd_flush_after_each_section()) {
3764 multifd_recv_sync_main();
3765 }
a7180877
DDAG
3766 break;
3767 default:
29fccade 3768 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
3769 " (postcopy mode)", flags);
3770 ret = -EINVAL;
7a9ddfbf
PX
3771 break;
3772 }
3773
644acf99
WY
3774 /* Got the whole host page, wait for decompress before placing. */
3775 if (place_needed) {
3776 ret |= wait_for_decompress_done();
3777 }
3778
7a9ddfbf
PX
3779 /* Detect for any possible file errors */
3780 if (!ret && qemu_file_get_error(f)) {
3781 ret = qemu_file_get_error(f);
a7180877
DDAG
3782 }
3783
7a9ddfbf 3784 if (!ret && place_needed) {
77dadc3f
PX
3785 if (tmp_page->all_zero) {
3786 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 3787 } else {
77dadc3f
PX
3788 ret = postcopy_place_page(mis, tmp_page->host_addr,
3789 place_source, block);
a7180877 3790 }
ddf35bdf 3791 place_needed = false;
77dadc3f 3792 postcopy_temp_page_reset(tmp_page);
a7180877 3793 }
a7180877
DDAG
3794 }
3795
3796 return ret;
3797}
3798
acab30b8
DHB
3799static bool postcopy_is_running(void)
3800{
3801 PostcopyState ps = postcopy_state_get();
3802 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3803}
3804
e6f4aa18
ZC
3805/*
3806 * Flush content of RAM cache into SVM's memory.
3807 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3808 */
24fa16f8 3809void colo_flush_ram_cache(void)
e6f4aa18
ZC
3810{
3811 RAMBlock *block = NULL;
3812 void *dst_host;
3813 void *src_host;
3814 unsigned long offset = 0;
3815
1e493be5 3816 memory_global_dirty_log_sync(false);
9d638407 3817 qemu_mutex_lock(&ram_state->bitmap_mutex);
89ac5a1d
DDAG
3818 WITH_RCU_READ_LOCK_GUARD() {
3819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3820 ramblock_sync_dirty_bitmap(ram_state, block);
3821 }
d1955d22 3822 }
d1955d22 3823
e6f4aa18 3824 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3825 WITH_RCU_READ_LOCK_GUARD() {
3826 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3827
89ac5a1d 3828 while (block) {
a6a83cef 3829 unsigned long num = 0;
e6f4aa18 3830
a6a83cef 3831 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
3832 if (!offset_in_ramblock(block,
3833 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 3834 offset = 0;
a6a83cef 3835 num = 0;
89ac5a1d
DDAG
3836 block = QLIST_NEXT_RCU(block, next);
3837 } else {
a6a83cef
RL
3838 unsigned long i = 0;
3839
3840 for (i = 0; i < num; i++) {
3841 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3842 }
8bba004c
AR
3843 dst_host = block->host
3844 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3845 src_host = block->colo_cache
3846 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
3847 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3848 offset += num;
89ac5a1d 3849 }
e6f4aa18
ZC
3850 }
3851 }
9d638407 3852 qemu_mutex_unlock(&ram_state->bitmap_mutex);
e6f4aa18
ZC
3853 trace_colo_flush_ram_cache_end();
3854}
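/*
 * In short: after re-syncing the dirty log, the loop above walks each block
 * for runs of dirty pages (colo_bitmap_find_dirty() returns the start of a
 * run of 'num' pages), clears their dirty bits, and copies the whole run
 * from colo_cache into the SVM's RAM with a single memcpy().
 */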
3855
10da4a36
WY
3856/**
3857 * ram_load_precopy: load pages in precopy case
3858 *
3859 * Returns 0 for success or -errno in case of error
3860 *
3861 * Called in precopy mode by ram_load().
3862 * rcu_read_lock is taken prior to this being called.
3863 *
3864 * @f: QEMUFile to receive the data from
3865 */
3866static int ram_load_precopy(QEMUFile *f)
56e93d26 3867{
755e8d7c 3868 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 3869 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3870 /* ADVISE is earlier; it shows the source has the postcopy capability on */
80fe315c 3871 bool postcopy_advised = migration_incoming_postcopy_advised();
a7a94d14 3872 if (!migrate_compress()) {
edc60127
JQ
3873 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3874 }
a7180877 3875
10da4a36 3876 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3877 ram_addr_t addr, total_ram_bytes;
0393031a 3878 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3879 uint8_t ch;
3880
e65cec5e
YK
3881 /*
3882 * Yield periodically to let the main loop run, but an iteration of
3883 * the main loop is expensive, so only do it once in a while
3884 */
3885 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3886 aio_co_schedule(qemu_get_current_aio_context(),
3887 qemu_coroutine_self());
3888 qemu_coroutine_yield();
3889 }
3890 i++;
3891
56e93d26
JQ
3892 addr = qemu_get_be64(f);
3893 flags = addr & ~TARGET_PAGE_MASK;
3894 addr &= TARGET_PAGE_MASK;
3895
edc60127
JQ
3896 if (flags & invalid_flags) {
3897 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3898 error_report("Received an unexpected compressed page");
3899 }
3900
3901 ret = -EINVAL;
3902 break;
3903 }
3904
bb890ed5 3905 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3906 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
3907 RAMBlock *block = ram_block_from_stream(mis, f, flags,
3908 RAM_CHANNEL_PRECOPY);
4c4bad48 3909
0393031a 3910 host = host_from_ram_block_offset(block, addr);
13af18f2 3911 /*
0393031a
HZ
3912 * After entering the COLO stage, we should not load pages
3913 * into the SVM's memory directly; we put them into colo_cache first.
3914 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3915 * Previously, we copied all this memory in the COLO preparation stage,
3916 * during which the VM had to be stopped, which is time-consuming.
3917 * Here we optimize it with a trick: back up every page during the
3918 * migration process while COLO is enabled. Although this affects
3919 * migration speed, it clearly reduces the downtime of backing up
3920 * all of the SVM's memory in the COLO preparation stage.
13af18f2 3921 */
0393031a
HZ
3922 if (migration_incoming_colo_enabled()) {
3923 if (migration_incoming_in_colo_state()) {
3924 /* In COLO stage, put all pages into cache temporarily */
8af66371 3925 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3926 } else {
3927 /*
3928 * In migration stage but before COLO stage,
3929 * Put all pages into both cache and SVM's memory.
3930 */
8af66371 3931 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3932 }
13af18f2 3933 }
a776aa15
DDAG
3934 if (!host) {
3935 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3936 ret = -EINVAL;
3937 break;
3938 }
13af18f2
ZC
3939 if (!migration_incoming_in_colo_state()) {
3940 ramblock_recv_bitmap_set(block, host);
3941 }
3942
1db9d8e5 3943 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3944 }
3945
56e93d26
JQ
3946 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3947 case RAM_SAVE_FLAG_MEM_SIZE:
3948 /* Synchronize RAM block list */
3949 total_ram_bytes = addr;
3950 while (!ret && total_ram_bytes) {
3951 RAMBlock *block;
56e93d26
JQ
3952 char id[256];
3953 ram_addr_t length;
3954
3955 len = qemu_get_byte(f);
3956 qemu_get_buffer(f, (uint8_t *)id, len);
3957 id[len] = 0;
3958 length = qemu_get_be64(f);
3959
e3dd7493 3960 block = qemu_ram_block_by_name(id);
b895de50
CLG
3961 if (block && !qemu_ram_is_migratable(block)) {
3962 error_report("block %s should not be migrated!", id);
3963 ret = -EINVAL;
3964 } else if (block) {
e3dd7493
DDAG
3965 if (length != block->used_length) {
3966 Error *local_err = NULL;
56e93d26 3967
fa53a0e5 3968 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3969 &local_err);
3970 if (local_err) {
3971 error_report_err(local_err);
56e93d26 3972 }
56e93d26 3973 }
ef08fb38 3974 /* For postcopy we need to check hugepage sizes match */
e846b746 3975 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
3976 block->page_size != qemu_host_page_size) {
3977 uint64_t remote_page_size = qemu_get_be64(f);
3978 if (remote_page_size != block->page_size) {
3979 error_report("Mismatched RAM page size %s "
3980 "(local) %zd != %" PRId64,
3981 id, block->page_size,
3982 remote_page_size);
3983 ret = -EINVAL;
3984 }
3985 }
fbd162e6
YK
3986 if (migrate_ignore_shared()) {
3987 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
3988 if (ramblock_is_ignored(block) &&
3989 block->mr->addr != addr) {
3990 error_report("Mismatched GPAs for block %s "
3991 "%" PRId64 "!= %" PRId64,
3992 id, (uint64_t)addr,
3993 (uint64_t)block->mr->addr);
3994 ret = -EINVAL;
3995 }
3996 }
e3dd7493
DDAG
3997 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3998 block->idstr);
3999 } else {
56e93d26
JQ
4000 error_report("Unknown ramblock \"%s\", cannot "
4001 "accept migration", id);
4002 ret = -EINVAL;
4003 }
4004
4005 total_ram_bytes -= length;
4006 }
4007 break;
a776aa15 4008
bb890ed5 4009 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4010 ch = qemu_get_byte(f);
4011 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4012 break;
a776aa15 4013
56e93d26 4014 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4015 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4016 break;
56e93d26 4017
a776aa15 4018 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4019 len = qemu_get_be32(f);
4020 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4021 error_report("Invalid compressed data length: %d", len);
4022 ret = -EINVAL;
4023 break;
4024 }
c1bc6626 4025 decompress_data_with_multi_threads(f, host, len);
56e93d26 4026 break;
a776aa15 4027
56e93d26 4028 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4029 if (load_xbzrle(f, addr, host) < 0) {
4030 error_report("Failed to decompress XBZRLE page at "
4031 RAM_ADDR_FMT, addr);
4032 ret = -EINVAL;
4033 break;
4034 }
4035 break;
294e5a40
JQ
4036 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4037 multifd_recv_sync_main();
4038 break;
56e93d26
JQ
4039 case RAM_SAVE_FLAG_EOS:
4040 /* normal exit */
b05292c2
JQ
4041 if (migrate_multifd_flush_after_each_section()) {
4042 multifd_recv_sync_main();
4043 }
56e93d26 4044 break;
5f1e7540
JQ
4045 case RAM_SAVE_FLAG_HOOK:
4046 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4047 break;
56e93d26 4048 default:
5f1e7540
JQ
4049 error_report("Unknown combination of migration flags: 0x%x", flags);
4050 ret = -EINVAL;
56e93d26
JQ
4051 }
4052 if (!ret) {
4053 ret = qemu_file_get_error(f);
4054 }
0393031a
HZ
4055 if (!ret && host_bak) {
4056 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4057 }
56e93d26
JQ
4058 }
4059
ca1a6b70 4060 ret |= wait_for_decompress_done();
10da4a36
WY
4061 return ret;
4062}
4063
4064static int ram_load(QEMUFile *f, void *opaque, int version_id)
4065{
4066 int ret = 0;
4067 static uint64_t seq_iter;
4068 /*
4069 * If system is running in postcopy mode, page inserts to host memory must
4070 * be atomic
4071 */
4072 bool postcopy_running = postcopy_is_running();
4073
4074 seq_iter++;
4075
4076 if (version_id != 4) {
4077 return -EINVAL;
4078 }
4079
4080 /*
4081 * This RCU critical section can be very long running.
4082 * When RCU reclaims in the code start to become numerous,
4083 * it will be necessary to reduce the granularity of this
4084 * critical section.
4085 */
89ac5a1d
DDAG
4086 WITH_RCU_READ_LOCK_GUARD() {
4087 if (postcopy_running) {
36f62f11
PX
4088 /*
4089 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4090 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4091 * service fast page faults.
4092 */
4093 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4094 } else {
4095 ret = ram_load_precopy(f);
4096 }
10da4a36 4097 }
55c4446b 4098 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4099
56e93d26
JQ
4100 return ret;
4101}
4102
c6467627
VSO
4103static bool ram_has_postcopy(void *opaque)
4104{
469dd51b 4105 RAMBlock *rb;
fbd162e6 4106 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4107 if (ramblock_is_pmem(rb)) {
4108 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4109 "is not supported now!", rb->idstr, rb->host);
4110 return false;
4111 }
4112 }
4113
c6467627
VSO
4114 return migrate_postcopy_ram();
4115}
4116
edd090c7
PX
4117/* Sync all the dirty bitmap with destination VM. */
4118static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4119{
4120 RAMBlock *block;
4121 QEMUFile *file = s->to_dst_file;
4122 int ramblock_count = 0;
4123
4124 trace_ram_dirty_bitmap_sync_start();
4125
fbd162e6 4126 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4127 qemu_savevm_send_recv_bitmap(file, block->idstr);
4128 trace_ram_dirty_bitmap_request(block->idstr);
4129 ramblock_count++;
4130 }
4131
4132 trace_ram_dirty_bitmap_sync_wait();
4133
4134 /* Wait until all the ramblocks' dirty bitmaps are synced */
4135 while (ramblock_count--) {
4136 qemu_sem_wait(&s->rp_state.rp_sem);
4137 }
4138
4139 trace_ram_dirty_bitmap_sync_complete();
4140
4141 return 0;
4142}
4143
4144static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4145{
4146 qemu_sem_post(&s->rp_state.rp_sem);
4147}
4148
a335debb
PX
4149/*
4150 * Read the received bitmap, revert it as the initial dirty bitmap.
4151 * This is only used when the postcopy migration is paused but wants
4152 * to resume from a middle point.
4153 */
4154int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4155{
4156 int ret = -EINVAL;
43044ac0 4157 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4158 QEMUFile *file = s->rp_state.from_dst_file;
4159 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4160 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4161 uint64_t size, end_mark;
4162
4163 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4164
4165 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4166 error_report("%s: incorrect state %s", __func__,
4167 MigrationStatus_str(s->state));
4168 return -EINVAL;
4169 }
4170
4171 /*
4172 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4173 * need the endianness conversion, and the padding.
a335debb
PX
4174 */
4175 local_size = ROUND_UP(local_size, 8);
4176
4177 /* Add paddings */
4178 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4179
4180 size = qemu_get_be64(file);
4181
4182 /* The size of the bitmap should match with our ramblock */
4183 if (size != local_size) {
4184 error_report("%s: ramblock '%s' bitmap size mismatch "
4185 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4186 block->idstr, size, local_size);
4187 ret = -EINVAL;
4188 goto out;
4189 }
4190
4191 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4192 end_mark = qemu_get_be64(file);
4193
4194 ret = qemu_file_get_error(file);
4195 if (ret || size != local_size) {
4196 error_report("%s: read bitmap failed for ramblock '%s': %d"
4197 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4198 __func__, block->idstr, ret, local_size, size);
4199 ret = -EIO;
4200 goto out;
4201 }
4202
4203 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4204 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4205 __func__, block->idstr, end_mark);
4206 ret = -EINVAL;
4207 goto out;
4208 }
4209
4210 /*
3a4452d8 4211 * Endianness conversion. We are in postcopy (though paused).
a335debb
PX
4212 * The dirty bitmap won't change. We can directly modify it.
4213 */
4214 bitmap_from_le(block->bmap, le_bitmap, nbits);
4215
4216 /*
4217 * What we received is "received bitmap". Revert it as the initial
4218 * dirty bitmap for this ramblock.
4219 */
4220 bitmap_complement(block->bmap, block->bmap, nbits);
4221
be39b4cd
DH
4222 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4223 ramblock_dirty_bitmap_clear_discarded_pages(block);
4224
4225 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4226 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4227
edd090c7
PX
4228 /*
4229 * We succeeded to sync bitmap for current ramblock. If this is
4230 * the last one to sync, we need to notify the main send thread.
4231 */
4232 ram_dirty_bitmap_reload_notify(s);
4233
a335debb
PX
4234 ret = 0;
4235out:
bf269906 4236 g_free(le_bitmap);
a335debb
PX
4237 return ret;
4238}
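/*
 * The recv-bitmap message consumed above appears to be: a big-endian 64-bit
 * size (which must equal the local bitmap size rounded up to 8 bytes), that
 * many bytes of little-endian bitmap data, and a big-endian 64-bit end mark
 * equal to RAMBLOCK_RECV_BITMAP_ENDING.
 */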
4239
edd090c7
PX
4240static int ram_resume_prepare(MigrationState *s, void *opaque)
4241{
4242 RAMState *rs = *(RAMState **)opaque;
08614f34 4243 int ret;
edd090c7 4244
08614f34
PX
4245 ret = ram_dirty_bitmap_sync_all(s, rs);
4246 if (ret) {
4247 return ret;
4248 }
4249
4250 ram_state_resume_prepare(rs, s->to_dst_file);
4251
4252 return 0;
edd090c7
PX
4253}
4254
36f62f11
PX
4255void postcopy_preempt_shutdown_file(MigrationState *s)
4256{
4257 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4258 qemu_fflush(s->postcopy_qemufile_src);
4259}
4260
56e93d26 4261static SaveVMHandlers savevm_ram_handlers = {
9907e842 4262 .save_setup = ram_save_setup,
56e93d26 4263 .save_live_iterate = ram_save_iterate,
763c906b 4264 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4265 .save_live_complete_precopy = ram_save_complete,
c6467627 4266 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4267 .state_pending_exact = ram_state_pending_exact,
4268 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4269 .load_state = ram_load,
f265e0e4
JQ
4270 .save_cleanup = ram_save_cleanup,
4271 .load_setup = ram_load_setup,
4272 .load_cleanup = ram_load_cleanup,
edd090c7 4273 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4274};
4275
c7c0e724
DH
4276static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4277 size_t old_size, size_t new_size)
4278{
cc61c703 4279 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4280 ram_addr_t offset;
4281 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4282 Error *err = NULL;
4283
4284 if (ramblock_is_ignored(rb)) {
4285 return;
4286 }
4287
4288 if (!migration_is_idle()) {
4289 /*
4290 * Precopy code on the source cannot deal with the size of RAM blocks
4291 * changing at random points in time - especially after sending the
4292 * RAM block sizes in the migration stream, they must no longer change.
4293 * Abort and indicate a proper reason.
4294 */
4295 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4296 migration_cancel(err);
c7c0e724 4297 error_free(err);
c7c0e724 4298 }
cc61c703
DH
4299
4300 switch (ps) {
4301 case POSTCOPY_INCOMING_ADVISE:
4302 /*
4303 * Update what ram_postcopy_incoming_init()->init_range() does at the
4304 * time postcopy was advised. Syncing RAM blocks with the source will
4305 * result in RAM resizes.
4306 */
4307 if (old_size < new_size) {
4308 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4309 error_report("RAM block '%s' discard of resized RAM failed",
4310 rb->idstr);
4311 }
4312 }
898ba906 4313 rb->postcopy_length = new_size;
cc61c703
DH
4314 break;
4315 case POSTCOPY_INCOMING_NONE:
4316 case POSTCOPY_INCOMING_RUNNING:
4317 case POSTCOPY_INCOMING_END:
4318 /*
4319 * Once our guest is running, postcopy no longer cares about
4320 * resizes. When growing, the new memory was not available on the
4321 * source, so no handler is needed.
4322 */
4323 break;
4324 default:
4325 error_report("RAM block '%s' resized during postcopy state: %d",
4326 rb->idstr, ps);
4327 exit(-1);
4328 }
c7c0e724
DH
4329}
4330
4331static RAMBlockNotifier ram_mig_ram_notifier = {
4332 .ram_block_resized = ram_mig_ram_block_resized,
4333};
4334
56e93d26
JQ
4335void ram_mig_init(void)
4336{
4337 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4338 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4339 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4340}