git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
53d37d36 43#include "page_cache.h"
56e93d26 44#include "qemu/error-report.h"
e688df6b 45#include "qapi/error.h"
ab7cbb0b 46#include "qapi/qapi-types-migration.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
b0c3cf94 55#include "sysemu/cpu-throttle.h"
edd090c7 56#include "savevm.h"
b9ee2f7d 57#include "qemu/iov.h"
d32ca5ad 58#include "multifd.h"
278e2f55
AG
59#include "sysemu/runstate.h"
60
e5fdf920
LS
61#include "hw/boards.h" /* for machine_dump_guest_core() */
62
278e2f55
AG
63#if defined(__linux__)
64#include "qemu/userfaultfd.h"
65#endif /* defined(__linux__) */
56e93d26 66
56e93d26
JQ
67/***********************************************************/
68/* ram save/restore */
69
7b548761
JQ
70/*
71 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
72 * worked for pages that were filled with the same char. We switched
bb890ed5 73 * it to only search for the zero value, and renamed it to avoid
7b548761 74 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
bb890ed5 75 */
7b548761
JQ
76/*
77 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
78 */
79#define RAM_SAVE_FLAG_FULL 0x01
bb890ed5 80#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
81#define RAM_SAVE_FLAG_MEM_SIZE 0x04
82#define RAM_SAVE_FLAG_PAGE 0x08
83#define RAM_SAVE_FLAG_EOS 0x10
84#define RAM_SAVE_FLAG_CONTINUE 0x20
85#define RAM_SAVE_FLAG_XBZRLE 0x40
7b548761 86/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
56e93d26 87#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
7b548761 88/* We can't use any flag that is bigger than 0x200 */
56e93d26 89
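/*
 * Illustrative sketch (not from the original source): these flags are
 * OR'ed into the low bits of the page offset that save_page_header()
 * writes on the wire, which is safe because offsets are always
 * TARGET_PAGE_SIZE aligned.  A continued normal page would thus be
 * announced roughly as:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_PAGE);
 */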
04ffce13 90int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
91 uint8_t *, int) = xbzrle_encode_buffer;
92#if defined(CONFIG_AVX512BW_OPT)
93#include "qemu/cpuid.h"
94static void __attribute__((constructor)) init_cpu_flag(void)
95{
96 unsigned max = __get_cpuid_max(0, NULL);
97 int a, b, c, d;
98 if (max >= 1) {
99 __cpuid(1, a, b, c, d);
100 /* We must check that AVX is not just available, but usable. */
101 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
102 int bv;
103 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
104 __cpuid_count(7, 0, a, b, c, d);
105 /* 0xe6:
106 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
107 * and ZMM16-ZMM31 state are enabled by OS)
108 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
109 */
110 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
111 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
112 }
113 }
114 }
115}
116#endif
117
9360447d
JQ
118XBZRLECacheStats xbzrle_counters;
119
f1668764
PX
120/* used by the search for pages to send */
121struct PageSearchStatus {
122 /* The migration channel used for a specific host page */
123 QEMUFile *pss_channel;
ec6f3ab9
PX
124 /* Last block from where we have sent data */
125 RAMBlock *last_sent_block;
f1668764
PX
126 /* Current block being searched */
127 RAMBlock *block;
128 /* Current page to search from */
129 unsigned long page;
130 /* Set once we wrap around */
131 bool complete_round;
f1668764
PX
132 /* Whether we're sending a host page */
133 bool host_page_sending;
134 /* The start/end of current host page. Invalid if host_page_sending==false */
135 unsigned long host_page_start;
136 unsigned long host_page_end;
137};
138typedef struct PageSearchStatus PageSearchStatus;
139
56e93d26
JQ
140/* This struct contains the XBZRLE cache and a static page
141 used by the compression */
142static struct {
143 /* buffer used for XBZRLE encoding */
144 uint8_t *encoded_buf;
145 /* buffer for storing page content */
146 uint8_t *current_buf;
147 /* Cache for XBZRLE, Protected by lock. */
148 PageCache *cache;
149 QemuMutex lock;
c00e0928
JQ
150 /* it will store a page full of zeros */
151 uint8_t *zero_target_page;
f265e0e4
JQ
152 /* buffer used for XBZRLE decoding */
153 uint8_t *decoded_buf;
56e93d26
JQ
154} XBZRLE;
155
56e93d26
JQ
156static void XBZRLE_cache_lock(void)
157{
f4c51a6b 158 if (migrate_use_xbzrle()) {
56e93d26 159 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 160 }
56e93d26
JQ
161}
162
163static void XBZRLE_cache_unlock(void)
164{
f4c51a6b 165 if (migrate_use_xbzrle()) {
56e93d26 166 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 167 }
56e93d26
JQ
168}
169
3d0684b2
JQ
170/**
171 * xbzrle_cache_resize: resize the xbzrle cache
172 *
cbde7be9 173 * This function is called from migrate_params_apply in main
3d0684b2
JQ
174 * thread, possibly while a migration is in progress. A running
175 * migration may be using the cache and might finish during this call,
176 * hence changes to the cache are protected by XBZRLE.lock().
177 *
c9dede2d 178 * Returns 0 for success or -1 for error
3d0684b2
JQ
179 *
180 * @new_size: new cache size
8acabf69 181 * @errp: *errp is set with the failure reason if the check failed
56e93d26 182 */
8b9407a0 183int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
184{
185 PageCache *new_cache;
c9dede2d 186 int64_t ret = 0;
56e93d26 187
8acabf69
JQ
188 /* Check for truncation */
189 if (new_size != (size_t)new_size) {
190 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
191 "exceeding address space");
192 return -1;
193 }
194
2a313e5c
JQ
195 if (new_size == migrate_xbzrle_cache_size()) {
196 /* nothing to do */
c9dede2d 197 return 0;
2a313e5c
JQ
198 }
199
56e93d26
JQ
200 XBZRLE_cache_lock();
201
202 if (XBZRLE.cache != NULL) {
80f8dfde 203 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 204 if (!new_cache) {
56e93d26
JQ
205 ret = -1;
206 goto out;
207 }
208
209 cache_fini(XBZRLE.cache);
210 XBZRLE.cache = new_cache;
211 }
56e93d26
JQ
212out:
213 XBZRLE_cache_unlock();
214 return ret;
215}
216
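/*
 * Hedged usage sketch (assumed caller shape, based on the comment above
 * that the real call site is migrate_params_apply() in the main thread):
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(params->xbzrle_cache_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */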
20123ee1
PX
217static bool postcopy_preempt_active(void)
218{
219 return migrate_postcopy_preempt() && migration_in_postcopy();
220}
221
3ded54b1 222bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
223{
224 return !qemu_ram_is_migratable(block) ||
225 (migrate_ignore_shared() && qemu_ram_is_shared(block));
226}
227
343f632c
DDAG
228#undef RAMBLOCK_FOREACH
229
fbd162e6
YK
230int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
231{
232 RAMBlock *block;
233 int ret = 0;
234
89ac5a1d
DDAG
235 RCU_READ_LOCK_GUARD();
236
fbd162e6
YK
237 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
238 ret = func(block, opaque);
239 if (ret) {
240 break;
241 }
242 }
fbd162e6
YK
243 return ret;
244}
245
f9494614
AP
246static void ramblock_recv_map_init(void)
247{
248 RAMBlock *rb;
249
fbd162e6 250 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
251 assert(!rb->receivedmap);
252 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
253 }
254}
255
256int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
257{
258 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
259 rb->receivedmap);
260}
261
1cba9f6e
DDAG
262bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
263{
264 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
265}
266
f9494614
AP
267void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
268{
269 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
270}
271
272void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
273 size_t nr)
274{
275 bitmap_set_atomic(rb->receivedmap,
276 ramblock_recv_bitmap_offset(host_addr, rb),
277 nr);
278}
279
a335debb
PX
280#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
281
282/*
283 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
284 *
285 * Returns >0 if success with sent bytes, or <0 if error.
286 */
287int64_t ramblock_recv_bitmap_send(QEMUFile *file,
288 const char *block_name)
289{
290 RAMBlock *block = qemu_ram_block_by_name(block_name);
291 unsigned long *le_bitmap, nbits;
292 uint64_t size;
293
294 if (!block) {
295 error_report("%s: invalid block name: %s", __func__, block_name);
296 return -1;
297 }
298
898ba906 299 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
300
301 /*
302 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
303 * machines we may need 4 more bytes for padding (see below
304 * comment). So extend it a bit beforehand.
305 */
306 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
307
308 /*
309 * Always use little endian when sending the bitmap. This is
310 * required when the source and destination VMs are not using the
3a4452d8 311 * same endianness. (Note: big endian won't work.)
a335debb
PX
312 */
313 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
314
315 /* Size of the bitmap, in bytes */
a725ef9f 316 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
317
318 /*
319 * size is always aligned to 8 bytes for 64bit machines, but it
320 * may not be true for 32bit machines. We need this padding to
321 * make sure the migration can survive even between 32bit and
322 * 64bit machines.
323 */
324 size = ROUND_UP(size, 8);
325
326 qemu_put_be64(file, size);
327 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
328 /*
329 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 330 * some "mysterious" reason.
a335debb
PX
331 */
332 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
333 qemu_fflush(file);
334
bf269906 335 g_free(le_bitmap);
a335debb
PX
336
337 if (qemu_file_get_error(file)) {
338 return qemu_file_get_error(file);
339 }
340
341 return size + sizeof(size);
342}
343
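/*
 * Hedged sketch of the matching receive side (the real counterpart lives
 * elsewhere, in ram_dirty_bitmap_reload(); variable names here are
 * illustrative): read the size, then the little-endian bitmap payload,
 * then verify the ending marker.
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         return -EINVAL;
 *     }
 */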
ec481c6c
JQ
344/*
345 * An outstanding page request, on the source, having been received
346 * and queued
347 */
348struct RAMSrcPageRequest {
349 RAMBlock *rb;
350 hwaddr offset;
351 hwaddr len;
352
353 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
354};
355
6f37bb8b
JQ
356/* State of RAM for migration */
357struct RAMState {
f1668764
PX
358 /*
359 * PageSearchStatus structures for the channels when sending pages.
360 * Protected by the bitmap_mutex.
361 */
362 PageSearchStatus pss[RAM_CHANNEL_MAX];
278e2f55
AG
363 /* UFFD file descriptor, used in 'write-tracking' migration */
364 int uffdio_fd;
8d80e195
JQ
365 /* total ram size in bytes */
366 uint64_t ram_bytes_total;
6f37bb8b
JQ
367 /* Last block that we have visited searching for dirty pages */
368 RAMBlock *last_seen_block;
269ace29
JQ
369 /* Last dirty target page we have sent */
370 ram_addr_t last_page;
6f37bb8b
JQ
371 /* last ram version we have seen */
372 uint32_t last_version;
8d820d6f
JQ
373 /* How many times we have seen too many dirty pages */
374 int dirty_rate_high_cnt;
f664da80
JQ
375 /* these variables are used for bitmap sync */
376 /* last time we did a full bitmap_sync */
377 int64_t time_last_bitmap_sync;
eac74159 378 /* bytes transferred at start_time */
c4bdf0cf 379 uint64_t bytes_xfer_prev;
a66cd90c 380 /* number of dirty pages since start_time */
68908ed6 381 uint64_t num_dirty_pages_period;
b5833fde
JQ
382 /* xbzrle misses since the beginning of the period */
383 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
384 /* Amount of xbzrle pages since the beginning of the period */
385 uint64_t xbzrle_pages_prev;
386 /* Amount of xbzrle encoded bytes since the beginning of the period */
387 uint64_t xbzrle_bytes_prev;
1a373522
DH
388 /* Start using XBZRLE (e.g., after the first round). */
389 bool xbzrle_enabled;
05931ec5
JQ
390 /* Are we on the last stage of migration */
391 bool last_stage;
76e03000
XG
392 /* compression statistics since the beginning of the period */
393 /* number of times there was no free thread to compress data */
394 uint64_t compress_thread_busy_prev;
395 /* amount of bytes after compression */
396 uint64_t compressed_size_prev;
397 /* amount of compressed pages */
398 uint64_t compress_pages_prev;
399
be8b02ed
XG
400 /* total handled target pages at the beginning of period */
401 uint64_t target_page_count_prev;
402 /* total handled target pages since start */
403 uint64_t target_page_count;
9360447d 404 /* number of dirty bits in the bitmap */
2dfaf12e 405 uint64_t migration_dirty_pages;
f1668764
PX
406 /*
407 * Protects:
408 * - dirty/clear bitmap
409 * - migration_dirty_pages
410 * - pss structures
411 */
108cfae0 412 QemuMutex bitmap_mutex;
68a098f3
JQ
413 /* The RAMBlock used in the last src_page_requests */
414 RAMBlock *last_req_rb;
ec481c6c
JQ
415 /* Queue of outstanding page requests from the destination */
416 QemuMutex src_page_req_mutex;
b58deb34 417 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
418};
419typedef struct RAMState RAMState;
420
53518d94 421static RAMState *ram_state;
6f37bb8b 422
bd227060
WW
423static NotifierWithReturnList precopy_notifier_list;
424
a1fe28df
PX
425/* Whether postcopy has queued requests */
426static bool postcopy_has_request(RAMState *rs)
427{
428 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
429}
430
bd227060
WW
431void precopy_infrastructure_init(void)
432{
433 notifier_with_return_list_init(&precopy_notifier_list);
434}
435
436void precopy_add_notifier(NotifierWithReturn *n)
437{
438 notifier_with_return_list_add(&precopy_notifier_list, n);
439}
440
441void precopy_remove_notifier(NotifierWithReturn *n)
442{
443 notifier_with_return_remove(n);
444}
445
446int precopy_notify(PrecopyNotifyReason reason, Error **errp)
447{
448 PrecopyNotifyData pnd;
449 pnd.reason = reason;
450 pnd.errp = errp;
451
452 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
453}
454
9edabd4d 455uint64_t ram_bytes_remaining(void)
2f4fde93 456{
bae416e5
DDAG
457 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
458 0;
2f4fde93
JQ
459}
460
23b7576d
PX
461/*
462 * NOTE: not all stats in ram_counters are used in reality. See comments
463 * for struct MigrationAtomicStats. The ultimate result of ram migration
464 * counters will be a merged version with both ram_counters and the atomic
465 * fields in ram_atomic_counters.
466 */
9360447d 467MigrationStats ram_counters;
23b7576d 468MigrationAtomicStats ram_atomic_counters;
96506894 469
26a26069 470void ram_transferred_add(uint64_t bytes)
4c2d0f6d 471{
ae680668
DE
472 if (runstate_is_running()) {
473 ram_counters.precopy_bytes += bytes;
474 } else if (migration_in_postcopy()) {
23b7576d 475 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
ae680668
DE
476 } else {
477 ram_counters.downtime_bytes += bytes;
478 }
23b7576d 479 stat64_add(&ram_atomic_counters.transferred, bytes);
4c2d0f6d
DE
480}
481
d59c40cc
LB
482void dirty_sync_missed_zero_copy(void)
483{
484 ram_counters.dirty_sync_missed_zero_copy++;
485}
486
4010ba38
JQ
487struct MigrationOps {
488 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
489};
490typedef struct MigrationOps MigrationOps;
491
492MigrationOps *migration_ops;
493
76e03000
XG
494CompressionStats compression_counters;
495
56e93d26 496struct CompressParam {
56e93d26 497 bool done;
90e56fb4 498 bool quit;
5e5fdcff 499 bool zero_page;
56e93d26
JQ
500 QEMUFile *file;
501 QemuMutex mutex;
502 QemuCond cond;
503 RAMBlock *block;
504 ram_addr_t offset;
34ab9e97
XG
505
506 /* internally used fields */
dcaf446e 507 z_stream stream;
34ab9e97 508 uint8_t *originbuf;
56e93d26
JQ
509};
510typedef struct CompressParam CompressParam;
511
512struct DecompressParam {
73a8912b 513 bool done;
90e56fb4 514 bool quit;
56e93d26
JQ
515 QemuMutex mutex;
516 QemuCond cond;
517 void *des;
d341d9f3 518 uint8_t *compbuf;
56e93d26 519 int len;
797ca154 520 z_stream stream;
56e93d26
JQ
521};
522typedef struct DecompressParam DecompressParam;
523
524static CompressParam *comp_param;
525static QemuThread *compress_threads;
526/* comp_done_cond is used to wake up the migration thread when
527 * one of the compression threads has finished the compression.
528 * comp_done_lock is used to co-work with comp_done_cond.
529 */
0d9f9a5c
LL
530static QemuMutex comp_done_lock;
531static QemuCond comp_done_cond;
56e93d26 532
34ab9e97 533static QEMUFile *decomp_file;
56e93d26
JQ
534static DecompressParam *decomp_param;
535static QemuThread *decompress_threads;
73a8912b
LL
536static QemuMutex decomp_done_lock;
537static QemuCond decomp_done_cond;
56e93d26 538
93589827
PX
539static int ram_save_host_page_urgent(PageSearchStatus *pss);
540
5e5fdcff 541static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 542 ram_addr_t offset, uint8_t *source_buf);
56e93d26 543
ebd88a49
PX
544/* NOTE: page is the PFN not real ram_addr_t. */
545static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
546{
547 pss->block = rb;
548 pss->page = page;
549 pss->complete_round = false;
550}
551
93589827
PX
552/*
553 * Check whether two PSSs are actively sending the same page. Return true
554 * if it is, false otherwise.
555 */
556static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
557{
558 return pss1->host_page_sending && pss2->host_page_sending &&
559 (pss1->host_page_start == pss2->host_page_start);
560}
561
56e93d26
JQ
562static void *do_data_compress(void *opaque)
563{
564 CompressParam *param = opaque;
a7a9a88f
LL
565 RAMBlock *block;
566 ram_addr_t offset;
5e5fdcff 567 bool zero_page;
56e93d26 568
a7a9a88f 569 qemu_mutex_lock(&param->mutex);
90e56fb4 570 while (!param->quit) {
a7a9a88f
LL
571 if (param->block) {
572 block = param->block;
573 offset = param->offset;
574 param->block = NULL;
575 qemu_mutex_unlock(&param->mutex);
576
5e5fdcff
XG
577 zero_page = do_compress_ram_page(param->file, &param->stream,
578 block, offset, param->originbuf);
a7a9a88f 579
0d9f9a5c 580 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 581 param->done = true;
5e5fdcff 582 param->zero_page = zero_page;
0d9f9a5c
LL
583 qemu_cond_signal(&comp_done_cond);
584 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
585
586 qemu_mutex_lock(&param->mutex);
587 } else {
56e93d26
JQ
588 qemu_cond_wait(&param->cond, &param->mutex);
589 }
56e93d26 590 }
a7a9a88f 591 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
592
593 return NULL;
594}
595
f0afa331 596static void compress_threads_save_cleanup(void)
56e93d26
JQ
597{
598 int i, thread_count;
599
05306935 600 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
601 return;
602 }
05306935 603
56e93d26
JQ
604 thread_count = migrate_compress_threads();
605 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
606 /*
607 * we use it as an indicator which shows whether the thread is
608 * properly init'd or not
609 */
610 if (!comp_param[i].file) {
611 break;
612 }
05306935
FL
613
614 qemu_mutex_lock(&comp_param[i].mutex);
615 comp_param[i].quit = true;
616 qemu_cond_signal(&comp_param[i].cond);
617 qemu_mutex_unlock(&comp_param[i].mutex);
618
56e93d26 619 qemu_thread_join(compress_threads + i);
56e93d26
JQ
620 qemu_mutex_destroy(&comp_param[i].mutex);
621 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 622 deflateEnd(&comp_param[i].stream);
34ab9e97 623 g_free(comp_param[i].originbuf);
dcaf446e
XG
624 qemu_fclose(comp_param[i].file);
625 comp_param[i].file = NULL;
56e93d26 626 }
0d9f9a5c
LL
627 qemu_mutex_destroy(&comp_done_lock);
628 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
629 g_free(compress_threads);
630 g_free(comp_param);
56e93d26
JQ
631 compress_threads = NULL;
632 comp_param = NULL;
56e93d26
JQ
633}
634
dcaf446e 635static int compress_threads_save_setup(void)
56e93d26
JQ
636{
637 int i, thread_count;
638
639 if (!migrate_use_compression()) {
dcaf446e 640 return 0;
56e93d26 641 }
56e93d26
JQ
642 thread_count = migrate_compress_threads();
643 compress_threads = g_new0(QemuThread, thread_count);
644 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
645 qemu_cond_init(&comp_done_cond);
646 qemu_mutex_init(&comp_done_lock);
56e93d26 647 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
648 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
649 if (!comp_param[i].originbuf) {
650 goto exit;
651 }
652
dcaf446e
XG
653 if (deflateInit(&comp_param[i].stream,
654 migrate_compress_level()) != Z_OK) {
34ab9e97 655 g_free(comp_param[i].originbuf);
dcaf446e
XG
656 goto exit;
657 }
658
e110aa91
C
659 /* comp_param[i].file is just used as a dummy buffer to save data,
660 * set its ops to empty.
56e93d26 661 */
77ef2dc1 662 comp_param[i].file = qemu_file_new_output(
c0e0825c 663 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 664 comp_param[i].done = true;
90e56fb4 665 comp_param[i].quit = false;
56e93d26
JQ
666 qemu_mutex_init(&comp_param[i].mutex);
667 qemu_cond_init(&comp_param[i].cond);
668 qemu_thread_create(compress_threads + i, "compress",
669 do_data_compress, comp_param + i,
670 QEMU_THREAD_JOINABLE);
671 }
dcaf446e
XG
672 return 0;
673
674exit:
675 compress_threads_save_cleanup();
676 return -1;
56e93d26
JQ
677}
678
679/**
3d0684b2 680 * save_page_header: write page header to wire
56e93d26
JQ
681 *
682 * If this is the 1st block, it also writes the block identification
683 *
3d0684b2 684 * Returns the number of bytes written
56e93d26 685 *
ec6f3ab9 686 * @pss: current PSS channel status
56e93d26
JQ
687 * @block: block that contains the page we want to send
688 * @offset: offset inside the block for the page
689 * in the lower bits, it contains flags
690 */
ec6f3ab9 691static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
2bf3aa85 692 ram_addr_t offset)
56e93d26 693{
9f5f380b 694 size_t size, len;
ec6f3ab9
PX
695 bool same_block = (block == pss->last_sent_block);
696 QEMUFile *f = pss->pss_channel;
56e93d26 697
10661f11 698 if (same_block) {
24795694
JQ
699 offset |= RAM_SAVE_FLAG_CONTINUE;
700 }
2bf3aa85 701 qemu_put_be64(f, offset);
56e93d26
JQ
702 size = 8;
703
10661f11 704 if (!same_block) {
9f5f380b 705 len = strlen(block->idstr);
2bf3aa85
JQ
706 qemu_put_byte(f, len);
707 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 708 size += 1 + len;
ec6f3ab9 709 pss->last_sent_block = block;
56e93d26
JQ
710 }
711 return size;
712}
713
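/*
 * Worked example (illustrative, not from the original source): sending the
 * first page of a block named "pc.ram" emits the 8-byte offset/flags word,
 * one length byte and the 6-byte idstr, so save_page_header() returns
 * 8 + 1 + 6 = 15; every following page of the same block carries
 * RAM_SAVE_FLAG_CONTINUE and only costs the 8-byte header.
 */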
3d0684b2 714/**
179a8080 715 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
716 *
717 * Reduce amount of guest cpu execution to hopefully slow down memory
718 * writes. If guest dirty memory rate is reduced below the rate at
719 * which we can transfer pages to the destination then we should be
720 * able to complete migration. Some workloads dirty memory way too
721 * fast and will not effectively converge, even with auto-converge.
070afca2 722 */
cbbf8182
KZ
723static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
724 uint64_t bytes_dirty_threshold)
070afca2
JH
725{
726 MigrationState *s = migrate_get_current();
2594f56d 727 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
728 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
729 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 730 int pct_max = s->parameters.max_cpu_throttle;
070afca2 731
cbbf8182
KZ
732 uint64_t throttle_now = cpu_throttle_get_percentage();
733 uint64_t cpu_now, cpu_ideal, throttle_inc;
734
070afca2
JH
735 /* We have not started throttling yet. Let's start it. */
736 if (!cpu_throttle_active()) {
737 cpu_throttle_set(pct_initial);
738 } else {
739 /* Throttling already on, just increase the rate */
cbbf8182
KZ
740 if (!pct_tailslow) {
741 throttle_inc = pct_increment;
742 } else {
743 /* Compute the ideal CPU percentage used by Guest, which may
744 * make the dirty rate match the dirty rate threshold. */
745 cpu_now = 100 - throttle_now;
746 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
747 bytes_dirty_period);
748 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
749 }
750 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
751 }
752}
753
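/*
 * Worked example (illustrative): with throttle_now = 20 the guest keeps
 * cpu_now = 80 percent of its CPU time.  If only half of the dirtied bytes
 * could be transferred (bytes_dirty_threshold / bytes_dirty_period = 0.5),
 * then cpu_ideal = 80 * 0.5 = 40, so with cpu-throttle-tailslow enabled the
 * throttle is raised by MIN(80 - 40, pct_increment) instead of by the full
 * pct_increment.
 */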
91fe9a8d
RL
754void mig_throttle_counter_reset(void)
755{
756 RAMState *rs = ram_state;
757
758 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
759 rs->num_dirty_pages_period = 0;
23b7576d 760 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
91fe9a8d
RL
761}
762
3d0684b2
JQ
763/**
764 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
765 *
6f37bb8b 766 * @rs: current RAM state
3d0684b2
JQ
767 * @current_addr: address for the zero page
768 *
769 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
770 * The important thing is that a stale (not-yet-0'd) page be replaced
771 * by the new data.
772 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 773 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 774 */
6f37bb8b 775static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 776{
56e93d26
JQ
777 /* We don't care if this fails to allocate a new cache page
778 * as long as it updated an old one */
c00e0928 779 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 780 ram_counters.dirty_sync_count);
56e93d26
JQ
781}
782
783#define ENCODING_FLAG_XBZRLE 0x1
784
785/**
786 * save_xbzrle_page: compress and send current page
787 *
788 * Returns: 1 means that we wrote the page
789 * 0 means that page is identical to the one already sent
790 * -1 means that xbzrle would be longer than normal
791 *
5a987738 792 * @rs: current RAM state
ec6f3ab9 793 * @pss: current PSS channel
3d0684b2
JQ
794 * @current_data: pointer to the address of the page contents
795 * @current_addr: addr of the page
56e93d26
JQ
796 * @block: block that contains the page we want to send
797 * @offset: offset inside the block for the page
56e93d26 798 */
ec6f3ab9 799static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
61717ea9
PX
800 uint8_t **current_data, ram_addr_t current_addr,
801 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
802{
803 int encoded_len = 0, bytes_xbzrle;
804 uint8_t *prev_cached_page;
ec6f3ab9 805 QEMUFile *file = pss->pss_channel;
56e93d26 806
9360447d
JQ
807 if (!cache_is_cached(XBZRLE.cache, current_addr,
808 ram_counters.dirty_sync_count)) {
809 xbzrle_counters.cache_miss++;
05931ec5 810 if (!rs->last_stage) {
56e93d26 811 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 812 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
813 return -1;
814 } else {
815 /* update *current_data when the page has been
816 inserted into cache */
817 *current_data = get_cached_data(XBZRLE.cache, current_addr);
818 }
819 }
820 return -1;
821 }
822
e460a4b1
WW
823 /*
824 * Reaching here means the page has hit the xbzrle cache, no matter what
825 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 826 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
827 *
828 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
829 * 2nd page turns out to be skipped (i.e. no new bytes written to the
830 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
831 * skipped page included. In this way, the encoding rate can tell if the
832 * guest page is good for xbzrle encoding.
833 */
834 xbzrle_counters.pages++;
56e93d26
JQ
835 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
836
837 /* save current buffer into memory */
838 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
839
840 /* XBZRLE encoding (if there is no overflow) */
04ffce13 841 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
842 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
843 TARGET_PAGE_SIZE);
ca353803
WY
844
845 /*
846 * Update the cache contents, so that it corresponds to the data
847 * sent, in all cases except where we skip the page.
848 */
05931ec5 849 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
850 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
851 /*
852 * In the case where we couldn't compress, ensure that the caller
853 * sends the data from the cache, since the guest might have
854 * changed the RAM since we copied it.
855 */
856 *current_data = prev_cached_page;
857 }
858
56e93d26 859 if (encoded_len == 0) {
55c4446b 860 trace_save_xbzrle_page_skipping();
56e93d26
JQ
861 return 0;
862 } else if (encoded_len == -1) {
55c4446b 863 trace_save_xbzrle_page_overflow();
9360447d 864 xbzrle_counters.overflow++;
e460a4b1 865 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
866 return -1;
867 }
868
56e93d26 869 /* Send XBZRLE based compressed page */
ec6f3ab9 870 bytes_xbzrle = save_page_header(pss, block,
204b88b8 871 offset | RAM_SAVE_FLAG_XBZRLE);
61717ea9
PX
872 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
873 qemu_put_be16(file, encoded_len);
874 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
56e93d26 875 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
876 /*
877 * Like compressed_size (please see update_compress_thread_counts),
878 * the xbzrle encoded bytes don't count the 8 byte header with
879 * RAM_SAVE_FLAG_CONTINUE.
880 */
881 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 882 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
883
884 return 1;
885}
886
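/*
 * Wire layout recap (illustrative): an XBZRLE page consists of the page
 * header written by save_page_header(), one ENCODING_FLAG_XBZRLE byte, a
 * 2-byte big-endian encoded length, and then encoded_len bytes of payload,
 * which is why bytes_xbzrle above adds encoded_len + 1 + 2.
 */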
3d0684b2 887/**
d9e474ea 888 * pss_find_next_dirty: find the next dirty page of current ramblock
f3f491fc 889 *
d9e474ea
PX
890 * This function updates pss->page to point to the next dirty page index
891 * within the ramblock to migrate, or the end of ramblock when nothing
892 * found. Note that when pss->host_page_sending==true it means we're in
893 * the middle of sending a host page, so we won't look for a dirty page
894 * outside the host page boundary.
3d0684b2 895 *
d9e474ea 896 * @pss: the current page search status
f3f491fc 897 */
d9e474ea 898static void pss_find_next_dirty(PageSearchStatus *pss)
56e93d26 899{
d9e474ea 900 RAMBlock *rb = pss->block;
6b6712ef
JQ
901 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
902 unsigned long *bitmap = rb->bmap;
56e93d26 903
fbd162e6 904 if (ramblock_is_ignored(rb)) {
d9e474ea
PX
905 /* Points directly to the end, so we know no dirty page */
906 pss->page = size;
907 return;
908 }
909
910 /*
911 * If we are in the middle of sending a host page, only look for dirty
912 * pages within the current host page being sent.
913 */
914 if (pss->host_page_sending) {
915 assert(pss->host_page_end);
916 size = MIN(size, pss->host_page_end);
b895de50
CLG
917 }
918
d9e474ea 919 pss->page = find_next_bit(bitmap, size, pss->page);
56e93d26
JQ
920}
921
1230a25f 922static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
923 unsigned long page)
924{
925 uint8_t shift;
926 hwaddr size, start;
927
928 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
929 return;
930 }
931
932 shift = rb->clear_bmap_shift;
933 /*
934 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
935 * can make things easier sometimes since then the start address
936 * of the small chunk will always be 64 pages aligned so the
937 * bitmap will always be aligned to unsigned long. We should
938 * even be able to remove this restriction but I'm simply
939 * keeping it.
940 */
941 assert(shift >= 6);
942
943 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 944 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
945 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
946 memory_region_clear_dirty_bitmap(rb->mr, start, size);
947}
948
949static void
1230a25f 950migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
951 unsigned long start,
952 unsigned long npages)
953{
954 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
955 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
956 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
957
958 /*
959 * Clear pages from start to start + npages - 1, so the end boundary is
960 * exclusive.
961 */
962 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 963 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
964 }
965}
966
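/*
 * Worked example (illustrative): with clear_bmap_shift = 6 a chunk covers
 * 64 pages.  Clearing start = 100, npages = 30 gives chunk_start = 64 and
 * chunk_end = 192, so the chunks beginning at pages 64 and 128 are both
 * passed to memory_region_clear_dirty_bitmap(), even though only part of
 * each chunk is dirty.
 */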
a6a83cef
RL
967/*
968 * colo_bitmap_find_dirty: find contiguous dirty pages from start
969 *
970 * Returns the page offset within the memory region of the start of the contiguous
971 * dirty page
972 *
973 * @rs: current RAM state
974 * @rb: RAMBlock where to search for dirty pages
975 * @start: page where we start the search
976 * @num: the number of contiguous dirty pages
977 */
978static inline
979unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
980 unsigned long start, unsigned long *num)
981{
982 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
983 unsigned long *bitmap = rb->bmap;
984 unsigned long first, next;
985
986 *num = 0;
987
988 if (ramblock_is_ignored(rb)) {
989 return size;
990 }
991
992 first = find_next_bit(bitmap, size, start);
993 if (first >= size) {
994 return first;
995 }
996 next = find_next_zero_bit(bitmap, size, first + 1);
997 assert(next >= first);
998 *num = next - first;
999 return first;
1000}
1001
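/*
 * Worked example (illustrative): for a bitmap whose bits are 0,0,1,1,1,0,...
 * a call with start = 0 returns 2 and sets *num = 3, i.e. the run of dirty
 * pages [2, 5) beginning at the first set bit at or after @start.
 */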
06b10688 1002static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1003 RAMBlock *rb,
1004 unsigned long page)
a82d593b
DDAG
1005{
1006 bool ret;
a82d593b 1007
002cad6b
PX
1008 /*
1009 * Clear dirty bitmap if needed. This _must_ be called before we
1010 * send any of the pages in the chunk because we need to make sure
1011 * we can capture further page content changes when we sync dirty
1012 * log the next time. So as long as we are going to send any of
1013 * the pages in the chunk we clear the remote dirty bitmap for all.
1014 * Clearing it earlier won't be a problem, but too late will.
1015 */
1230a25f 1016 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 1017
6b6712ef 1018 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 1019 if (ret) {
0d8ec885 1020 rs->migration_dirty_pages--;
a82d593b 1021 }
386a907b 1022
a82d593b
DDAG
1023 return ret;
1024}
1025
be39b4cd
DH
1026static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1027 void *opaque)
1028{
1029 const hwaddr offset = section->offset_within_region;
1030 const hwaddr size = int128_get64(section->size);
1031 const unsigned long start = offset >> TARGET_PAGE_BITS;
1032 const unsigned long npages = size >> TARGET_PAGE_BITS;
1033 RAMBlock *rb = section->mr->ram_block;
1034 uint64_t *cleared_bits = opaque;
1035
1036 /*
1037 * We don't grab ram_state->bitmap_mutex because we expect to run
1038 * only when starting migration or during postcopy recovery where
1039 * we don't have concurrent access.
1040 */
1041 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1042 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1043 }
1044 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1045 bitmap_clear(rb->bmap, start, npages);
1046}
1047
1048/*
1049 * Exclude all dirty pages from migration that fall into a discarded range as
1050 * managed by a RamDiscardManager responsible for the mapped memory region of
1051 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1052 *
1053 * Discarded pages ("logically unplugged") have undefined content and must
1054 * not get migrated, because even reading these pages for migration might
1055 * result in undesired behavior.
1056 *
1057 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1058 *
1059 * Note: The result is only stable while migrating (precopy/postcopy).
1060 */
1061static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1062{
1063 uint64_t cleared_bits = 0;
1064
1065 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1066 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1067 MemoryRegionSection section = {
1068 .mr = rb->mr,
1069 .offset_within_region = 0,
1070 .size = int128_make64(qemu_ram_get_used_length(rb)),
1071 };
1072
1073 ram_discard_manager_replay_discarded(rdm, &section,
1074 dirty_bitmap_clear_section,
1075 &cleared_bits);
1076 }
1077 return cleared_bits;
1078}
1079
9470c5e0
DH
1080/*
1081 * Check if a host-page aligned page falls into a discarded range as managed by
1082 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1083 *
1084 * Note: The result is only stable while migrating (precopy/postcopy).
1085 */
1086bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1087{
1088 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1089 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1090 MemoryRegionSection section = {
1091 .mr = rb->mr,
1092 .offset_within_region = start,
1093 .size = int128_make64(qemu_ram_pagesize(rb)),
1094 };
1095
1096 return !ram_discard_manager_is_populated(rdm, &section);
1097 }
1098 return false;
1099}
1100
267691b6 1101/* Called with RCU critical section */
7a3e9571 1102static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1103{
fb613580
KZ
1104 uint64_t new_dirty_pages =
1105 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1106
1107 rs->migration_dirty_pages += new_dirty_pages;
1108 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1109}
1110
3d0684b2
JQ
1111/**
1112 * ram_pagesize_summary: calculate all the pagesizes of a VM
1113 *
1114 * Returns a summary bitmap of the page sizes of all RAMBlocks
1115 *
1116 * For VMs with just normal pages this is equivalent to the host page
1117 * size. If it's got some huge pages then it's the OR of all the
1118 * different page sizes.
e8ca1db2
DDAG
1119 */
1120uint64_t ram_pagesize_summary(void)
1121{
1122 RAMBlock *block;
1123 uint64_t summary = 0;
1124
fbd162e6 1125 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1126 summary |= block->page_size;
1127 }
1128
1129 return summary;
1130}
1131
aecbfe9c
XG
1132uint64_t ram_get_total_transferred_pages(void)
1133{
23b7576d
PX
1134 return stat64_get(&ram_atomic_counters.normal) +
1135 stat64_get(&ram_atomic_counters.duplicate) +
1136 compression_counters.pages + xbzrle_counters.pages;
aecbfe9c
XG
1137}
1138
b734035b
XG
1139static void migration_update_rates(RAMState *rs, int64_t end_time)
1140{
be8b02ed 1141 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1142 double compressed_size;
b734035b
XG
1143
1144 /* calculate period counters */
1145 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1146 / (end_time - rs->time_last_bitmap_sync);
1147
be8b02ed 1148 if (!page_count) {
b734035b
XG
1149 return;
1150 }
1151
1152 if (migrate_use_xbzrle()) {
e460a4b1
WW
1153 double encoded_size, unencoded_size;
1154
b734035b 1155 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1156 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1157 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1158 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1159 TARGET_PAGE_SIZE;
1160 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1161 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1162 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1163 } else {
1164 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1165 }
1166 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1167 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1168 }
76e03000
XG
1169
1170 if (migrate_use_compression()) {
1171 compression_counters.busy_rate = (double)(compression_counters.busy -
1172 rs->compress_thread_busy_prev) / page_count;
1173 rs->compress_thread_busy_prev = compression_counters.busy;
1174
1175 compressed_size = compression_counters.compressed_size -
1176 rs->compressed_size_prev;
1177 if (compressed_size) {
1178 double uncompressed_size = (compression_counters.pages -
1179 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1180
1181 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1182 compression_counters.compression_rate =
1183 uncompressed_size / compressed_size;
1184
1185 rs->compress_pages_prev = compression_counters.pages;
1186 rs->compressed_size_prev = compression_counters.compressed_size;
1187 }
1188 }
b734035b
XG
1189}
1190
dc14a470
KZ
1191static void migration_trigger_throttle(RAMState *rs)
1192{
1193 MigrationState *s = migrate_get_current();
1194 uint64_t threshold = s->parameters.throttle_trigger_threshold;
23b7576d
PX
1195 uint64_t bytes_xfer_period =
1196 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
dc14a470
KZ
1197 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1198 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1199
1200 /* During block migration the auto-converge logic incorrectly detects
1201 * that ram migration makes no progress. Avoid this by disabling the
1202 * throttling logic during the bulk phase of block migration. */
1203 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1204 /* The following detection logic can be refined later. For now:
1205 Check to see if the ratio between dirtied bytes and the approx.
1206 amount of bytes that just got transferred since the last time
1207 we were in this routine reaches the threshold. If that happens
1208 twice, start or increase throttling. */
1209
1210 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1211 (++rs->dirty_rate_high_cnt >= 2)) {
1212 trace_migration_throttle();
1213 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1214 mig_throttle_guest_down(bytes_dirty_period,
1215 bytes_dirty_threshold);
dc14a470
KZ
1216 }
1217 }
1218}
1219
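/*
 * Worked example (illustrative, assuming throttle-trigger-threshold keeps
 * its default of 50): if 1 GiB was transferred in the last period,
 * bytes_dirty_threshold is 512 MiB; dirtying more than that in two
 * consecutive periods triggers (or increases) CPU throttling.
 */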
8d820d6f 1220static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1221{
1222 RAMBlock *block;
56e93d26 1223 int64_t end_time;
56e93d26 1224
9360447d 1225 ram_counters.dirty_sync_count++;
56e93d26 1226
f664da80
JQ
1227 if (!rs->time_last_bitmap_sync) {
1228 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1229 }
1230
1231 trace_migration_bitmap_sync_start();
9c1f8f44 1232 memory_global_dirty_log_sync();
56e93d26 1233
108cfae0 1234 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1235 WITH_RCU_READ_LOCK_GUARD() {
1236 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1237 ramblock_sync_dirty_bitmap(rs, block);
1238 }
1239 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1240 }
108cfae0 1241 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1242
9458a9a1 1243 memory_global_after_dirty_log_sync();
a66cd90c 1244 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1245
56e93d26
JQ
1246 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1247
1248 /* more than 1 second = 1000 milliseconds */
f664da80 1249 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1250 migration_trigger_throttle(rs);
070afca2 1251
b734035b
XG
1252 migration_update_rates(rs, end_time);
1253
be8b02ed 1254 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1255
1256 /* reset period counters */
f664da80 1257 rs->time_last_bitmap_sync = end_time;
a66cd90c 1258 rs->num_dirty_pages_period = 0;
23b7576d 1259 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
56e93d26 1260 }
4addcd4f 1261 if (migrate_use_events()) {
3ab72385 1262 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1263 }
56e93d26
JQ
1264}
1265
bd227060
WW
1266static void migration_bitmap_sync_precopy(RAMState *rs)
1267{
1268 Error *local_err = NULL;
1269
1270 /*
1271 * The current notifier usage is just an optimization to migration, so we
1272 * don't stop the normal migration process in the error case.
1273 */
1274 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1275 error_report_err(local_err);
b4a1733c 1276 local_err = NULL;
bd227060
WW
1277 }
1278
1279 migration_bitmap_sync(rs);
1280
1281 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1282 error_report_err(local_err);
1283 }
1284}
1285
a4dbaf8e 1286void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1287{
1288 if (!migrate_release_ram() || !migration_in_postcopy()) {
1289 return;
1290 }
1291
1292 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1293}
1294
6c97ec5f
XG
1295/**
1296 * save_zero_page_to_file: send the zero page to the file
1297 *
1298 * Returns the size of data written to the file; 0 means the page is not
1299 * a zero page
1300 *
ec6f3ab9 1301 * @pss: current PSS channel
6c97ec5f
XG
1302 * @block: block that contains the page we want to send
1303 * @offset: offset inside the block for the page
1304 */
ec6f3ab9 1305static int save_zero_page_to_file(PageSearchStatus *pss,
6c97ec5f
XG
1306 RAMBlock *block, ram_addr_t offset)
1307{
1308 uint8_t *p = block->host + offset;
ec6f3ab9 1309 QEMUFile *file = pss->pss_channel;
6c97ec5f
XG
1310 int len = 0;
1311
bad452a7 1312 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
ec6f3ab9 1313 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
6c97ec5f
XG
1314 qemu_put_byte(file, 0);
1315 len += 1;
47fe16ff 1316 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1317 }
1318 return len;
1319}
1320
56e93d26 1321/**
3d0684b2 1322 * save_zero_page: send the zero page to the stream
56e93d26 1323 *
3d0684b2 1324 * Returns the number of pages written.
56e93d26 1325 *
ec6f3ab9 1326 * @pss: current PSS channel
56e93d26
JQ
1327 * @block: block that contains the page we want to send
1328 * @offset: offset inside the block for the page
56e93d26 1329 */
ec6f3ab9 1330static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1331 ram_addr_t offset)
56e93d26 1332{
ec6f3ab9 1333 int len = save_zero_page_to_file(pss, block, offset);
56e93d26 1334
6c97ec5f 1335 if (len) {
23b7576d 1336 stat64_add(&ram_atomic_counters.duplicate, 1);
4c2d0f6d 1337 ram_transferred_add(len);
6c97ec5f 1338 return 1;
56e93d26 1339 }
6c97ec5f 1340 return -1;
56e93d26
JQ
1341}
1342
059ff0fb
XG
1343/*
1344 * @pages: the number of pages written by the control path,
1345 * < 0 - error
1346 * > 0 - number of pages written
1347 *
1348 * Return true if the page has been saved, otherwise false is returned.
1349 */
61717ea9
PX
1350static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1351 ram_addr_t offset, int *pages)
059ff0fb
XG
1352{
1353 uint64_t bytes_xmit = 0;
1354 int ret;
1355
1356 *pages = -1;
61717ea9
PX
1357 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1358 TARGET_PAGE_SIZE, &bytes_xmit);
059ff0fb
XG
1359 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1360 return false;
1361 }
1362
1363 if (bytes_xmit) {
4c2d0f6d 1364 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1365 *pages = 1;
1366 }
1367
1368 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1369 return true;
1370 }
1371
1372 if (bytes_xmit > 0) {
23b7576d 1373 stat64_add(&ram_atomic_counters.normal, 1);
059ff0fb 1374 } else if (bytes_xmit == 0) {
23b7576d 1375 stat64_add(&ram_atomic_counters.duplicate, 1);
059ff0fb
XG
1376 }
1377
1378 return true;
1379}
1380
65dacaa0
XG
1381/*
1382 * directly send the page to the stream
1383 *
1384 * Returns the number of pages written.
1385 *
ec6f3ab9 1386 * @pss: current PSS channel
65dacaa0
XG
1387 * @block: block that contains the page we want to send
1388 * @offset: offset inside the block for the page
1389 * @buf: the page to be sent
1391 * @async: send the page asynchronously
1391 */
ec6f3ab9 1392static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1393 ram_addr_t offset, uint8_t *buf, bool async)
65dacaa0 1394{
ec6f3ab9
PX
1395 QEMUFile *file = pss->pss_channel;
1396
1397 ram_transferred_add(save_page_header(pss, block,
4c2d0f6d 1398 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0 1399 if (async) {
61717ea9 1400 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
f912ec5b 1401 migrate_release_ram() &&
65dacaa0
XG
1402 migration_in_postcopy());
1403 } else {
61717ea9 1404 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
65dacaa0 1405 }
4c2d0f6d 1406 ram_transferred_add(TARGET_PAGE_SIZE);
23b7576d 1407 stat64_add(&ram_atomic_counters.normal, 1);
65dacaa0
XG
1408 return 1;
1409}
1410
56e93d26 1411/**
3d0684b2 1412 * ram_save_page: send the given page to the stream
56e93d26 1413 *
3d0684b2 1414 * Returns the number of pages written.
3fd3c4b3
DDAG
1415 * < 0 - error
1416 * >=0 - Number of pages written - this might legally be 0
1417 * if xbzrle noticed the page was the same.
56e93d26 1418 *
6f37bb8b 1419 * @rs: current RAM state
56e93d26
JQ
1420 * @block: block that contains the page we want to send
1421 * @offset: offset inside the block for the page
56e93d26 1422 */
05931ec5 1423static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1424{
1425 int pages = -1;
56e93d26 1426 uint8_t *p;
56e93d26 1427 bool send_async = true;
a08f6890 1428 RAMBlock *block = pss->block;
8bba004c 1429 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1430 ram_addr_t current_addr = block->offset + offset;
56e93d26 1431
2f68e399 1432 p = block->host + offset;
1db9d8e5 1433 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1434
56e93d26 1435 XBZRLE_cache_lock();
1a373522 1436 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
ec6f3ab9 1437 pages = save_xbzrle_page(rs, pss, &p, current_addr,
61717ea9 1438 block, offset);
05931ec5 1439 if (!rs->last_stage) {
059ff0fb
XG
1440 /* Can't send this cached data async, since the cache page
1441 * might get updated before it gets to the wire
56e93d26 1442 */
059ff0fb 1443 send_async = false;
56e93d26
JQ
1444 }
1445 }
1446
1447 /* XBZRLE overflow or normal page */
1448 if (pages == -1) {
ec6f3ab9 1449 pages = save_normal_page(pss, block, offset, p, send_async);
56e93d26
JQ
1450 }
1451
1452 XBZRLE_cache_unlock();
1453
1454 return pages;
1455}
1456
61717ea9 1457static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
b9ee2f7d
JQ
1458 ram_addr_t offset)
1459{
61717ea9 1460 if (multifd_queue_page(file, block, offset) < 0) {
713f762a
IR
1461 return -1;
1462 }
23b7576d 1463 stat64_add(&ram_atomic_counters.normal, 1);
b9ee2f7d
JQ
1464
1465 return 1;
1466}
1467
5e5fdcff 1468static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1469 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1470{
53518d94 1471 RAMState *rs = ram_state;
ec6f3ab9 1472 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
20d549cb 1473 uint8_t *p = block->host + offset;
6ef3771c 1474 int ret;
56e93d26 1475
ec6f3ab9 1476 if (save_zero_page_to_file(pss, block, offset)) {
e7f2e190 1477 return true;
5e5fdcff
XG
1478 }
1479
ec6f3ab9 1480 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1481
1482 /*
1483 * copy it to an internal buffer to avoid it being modified by the VM,
1484 * so that we can catch any error during compression and
1485 * decompression
1486 */
1487 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1488 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1489 if (ret < 0) {
1490 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1491 error_report("compressed data failed!");
b3be2896 1492 }
e7f2e190 1493 return false;
5e5fdcff
XG
1494}
1495
1496static void
1497update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1498{
4c2d0f6d 1499 ram_transferred_add(bytes_xmit);
76e03000 1500
5e5fdcff 1501 if (param->zero_page) {
23b7576d 1502 stat64_add(&ram_atomic_counters.duplicate, 1);
76e03000 1503 return;
5e5fdcff 1504 }
76e03000
XG
1505
1506 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1507 compression_counters.compressed_size += bytes_xmit - 8;
1508 compression_counters.pages++;
56e93d26
JQ
1509}
1510
32b05495
XG
1511static bool save_page_use_compression(RAMState *rs);
1512
ce25d337 1513static void flush_compressed_data(RAMState *rs)
56e93d26 1514{
eaa238ab 1515 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1516 int idx, len, thread_count;
1517
32b05495 1518 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1519 return;
1520 }
1521 thread_count = migrate_compress_threads();
a7a9a88f 1522
0d9f9a5c 1523 qemu_mutex_lock(&comp_done_lock);
56e93d26 1524 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1525 while (!comp_param[idx].done) {
0d9f9a5c 1526 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1527 }
a7a9a88f 1528 }
0d9f9a5c 1529 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1530
1531 for (idx = 0; idx < thread_count; idx++) {
1532 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1533 if (!comp_param[idx].quit) {
eaa238ab 1534 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
5e5fdcff
XG
1535 /*
1536 * it's safe to fetch zero_page without holding comp_done_lock
1537 * as there is no further request submitted to the thread,
1538 * i.e, the thread should be waiting for a request at this point.
1539 */
1540 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1541 }
a7a9a88f 1542 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1543 }
1544}
1545
1546static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1547 ram_addr_t offset)
1548{
1549 param->block = block;
1550 param->offset = offset;
1551}
1552
eaa238ab 1553static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
1554{
1555 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1556 bool wait = migrate_compress_wait_thread();
eaa238ab 1557 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1558
1559 thread_count = migrate_compress_threads();
0d9f9a5c 1560 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1561retry:
1562 for (idx = 0; idx < thread_count; idx++) {
1563 if (comp_param[idx].done) {
1564 comp_param[idx].done = false;
eaa238ab
PX
1565 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1566 comp_param[idx].file);
1d58872a
XG
1567 qemu_mutex_lock(&comp_param[idx].mutex);
1568 set_compress_params(&comp_param[idx], block, offset);
1569 qemu_cond_signal(&comp_param[idx].cond);
1570 qemu_mutex_unlock(&comp_param[idx].mutex);
1571 pages = 1;
5e5fdcff 1572 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1573 break;
56e93d26
JQ
1574 }
1575 }
1d58872a
XG
1576
1577 /*
1578 * wait for a free thread if the user specifies 'compress-wait-thread',
1579 * otherwise we will post the page out in the main thread as a normal page.
1580 */
1581 if (pages < 0 && wait) {
1582 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1583 goto retry;
1584 }
0d9f9a5c 1585 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1586
1587 return pages;
1588}
1589
31e2ac74
JQ
1590#define PAGE_ALL_CLEAN 0
1591#define PAGE_TRY_AGAIN 1
1592#define PAGE_DIRTY_FOUND 2
3d0684b2
JQ
1593/**
1594 * find_dirty_block: find the next dirty page and update any state
1595 * associated with the search process.
b9e60928 1596 *
31e2ac74
JQ
1597 * Returns:
1598 * PAGE_ALL_CLEAN: no dirty page found, give up
1599 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1600 * PAGE_DIRTY_FOUND: dirty page found
b9e60928 1601 *
6f37bb8b 1602 * @rs: current RAM state
3d0684b2
JQ
1603 * @pss: data about the state of the current dirty page scan
b9e60928 1605 */
31e2ac74 1606static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
b9e60928 1607{
d9e474ea
PX
1608 /* Update pss->page for the next dirty bit in ramblock */
1609 pss_find_next_dirty(pss);
1610
6f37bb8b 1611 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1612 pss->page >= rs->last_page) {
b9e60928
DDAG
1613 /*
1614 * We've been once around the RAM and haven't found anything.
1615 * Give up.
1616 */
31e2ac74 1617 return PAGE_ALL_CLEAN;
b9e60928 1618 }
542147f4
DH
1619 if (!offset_in_ramblock(pss->block,
1620 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1621 /* Didn't find anything in this RAM Block */
a935e30f 1622 pss->page = 0;
b9e60928
DDAG
1623 pss->block = QLIST_NEXT_RCU(pss->block, next);
1624 if (!pss->block) {
48df9d80
XG
1625 /*
1626 * If memory migration starts over, we will meet a dirtied page
1627 * which may still exist in the compression threads' ring, so we
1628 * should flush the compressed data to make sure the new page
1629 * is not overwritten by the old one in the destination.
1630 *
1631 * Also, if xbzrle is on, stop using the data compression at this
1632 * point. In theory, xbzrle can do better than compression.
1633 */
1634 flush_compressed_data(rs);
1635
b9e60928
DDAG
1636 /* Hit the end of the list */
1637 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1638 /* Flag that we've looped */
1639 pss->complete_round = true;
1a373522
DH
1640 /* After the first round, enable XBZRLE. */
1641 if (migrate_use_xbzrle()) {
1642 rs->xbzrle_enabled = true;
1643 }
b9e60928
DDAG
1644 }
1645 /* Didn't find anything this time, but try again on the new block */
31e2ac74 1646 return PAGE_TRY_AGAIN;
b9e60928 1647 } else {
31e2ac74
JQ
1648 /* We've found something */
1649 return PAGE_DIRTY_FOUND;
b9e60928
DDAG
1650 }
1651}
1652
3d0684b2
JQ
1653/**
1654 * unqueue_page: gets a page off the queue
1655 *
a82d593b 1656 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1657 *
3d0684b2
JQ
1658 * Returns the block of the page (or NULL if none available)
1659 *
ec481c6c 1660 * @rs: current RAM state
3d0684b2 1661 * @offset: used to return the offset within the RAMBlock
a82d593b 1662 */
f20e2865 1663static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1664{
a1fe28df 1665 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1666 RAMBlock *block = NULL;
1667
a1fe28df 1668 if (!postcopy_has_request(rs)) {
ae526e32
XG
1669 return NULL;
1670 }
1671
6e8a355d 1672 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1673
1674 /*
1675 * This should _never_ change even after we take the lock, because no one
1676 * should be taking anything off the request list other than us.
1677 */
1678 assert(postcopy_has_request(rs));
1679
1680 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1681 block = entry->rb;
1682 *offset = entry->offset;
1683
777f53c7
TH
1684 if (entry->len > TARGET_PAGE_SIZE) {
1685 entry->len -= TARGET_PAGE_SIZE;
1686 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1687 } else {
1688 memory_region_unref(block->mr);
1689 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1690 g_free(entry);
1691 migration_consume_urgent_request();
a82d593b 1692 }
a82d593b
DDAG
1693
1694 return block;
1695}
1696
278e2f55
AG
1697#if defined(__linux__)
1698/**
1699 * poll_fault_page: try to get the next UFFD write fault page and, if a pending
1700 * fault is found, return the RAM block pointer and page offset
1701 *
1702 * Returns pointer to the RAMBlock containing faulting page,
1703 * NULL if no write faults are pending
1704 *
1705 * @rs: current RAM state
1706 * @offset: page offset from the beginning of the block
1707 */
1708static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1709{
1710 struct uffd_msg uffd_msg;
1711 void *page_address;
82ea3e3b 1712 RAMBlock *block;
278e2f55
AG
1713 int res;
1714
1715 if (!migrate_background_snapshot()) {
1716 return NULL;
1717 }
1718
1719 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1720 if (res <= 0) {
1721 return NULL;
1722 }
1723
1724 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1725 block = qemu_ram_block_from_host(page_address, false, offset);
1726 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1727 return block;
278e2f55
AG
1728}
1729
1730/**
1731 * ram_save_release_protection: release UFFD write protection after
1732 * a range of pages has been saved
1733 *
1734 * @rs: current RAM state
1735 * @pss: page-search-status structure
1736 * @start_page: index of the first page in the range relative to pss->block
1737 *
1738 * Returns 0 on success, negative value in case of an error
1739 */
1740static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1741 unsigned long start_page)
1742{
1743 int res = 0;
1744
1745 /* Check if page is from UFFD-managed region. */
1746 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1747 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1748 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1749
1750 /* Flush async buffers before un-protect. */
61717ea9 1751 qemu_fflush(pss->pss_channel);
278e2f55
AG
1752 /* Un-protect memory range. */
1753 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1754 false, false);
1755 }
1756
1757 return res;
1758}
1759
1760/* ram_write_tracking_available: check if kernel supports required UFFD features
1761 *
1762 * Returns true if supported, false otherwise
1763 */
1764bool ram_write_tracking_available(void)
1765{
1766 uint64_t uffd_features;
1767 int res;
1768
1769 res = uffd_query_features(&uffd_features);
1770 return (res == 0 &&
1771 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1772}
1773
1774/* ram_write_tracking_compatible: check if guest configuration is
1775 * compatible with 'write-tracking'
1776 *
1777 * Returns true if compatible, false otherwise
1778 */
1779bool ram_write_tracking_compatible(void)
1780{
1781 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1782 int uffd_fd;
82ea3e3b 1783 RAMBlock *block;
278e2f55
AG
1784 bool ret = false;
1785
1786 /* Open UFFD file descriptor */
1787 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1788 if (uffd_fd < 0) {
1789 return false;
1790 }
1791
1792 RCU_READ_LOCK_GUARD();
1793
82ea3e3b 1794 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1795 uint64_t uffd_ioctls;
1796
1797 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1798 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1799 continue;
1800 }
1801 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1802 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1803 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1804 goto out;
1805 }
1806 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1807 goto out;
1808 }
1809 }
1810 ret = true;
1811
1812out:
1813 uffd_close_fd(uffd_fd);
1814 return ret;
1815}
1816
f7b9dcfb
DH
1817static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1818 ram_addr_t size)
1819{
5f19a449
DH
1820 const ram_addr_t end = offset + size;
1821
f7b9dcfb
DH
1822 /*
1823 * We read one byte of each page; this will preallocate page tables if
1824 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1825 * where no page was populated yet. This might require adaptation when
1826 * supporting other mappings, like shmem.
1827 */
5f19a449 1828 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1829 char tmp = *((char *)block->host + offset);
1830
1831 /* Don't optimize the read out */
1832 asm volatile("" : "+r" (tmp));
1833 }
1834}
1835
6fee3a1f
DH
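/*
 * Callback for ram_discard_manager_replay_populated(): populate the pages
 * of one populated section of a RAMBlock.
 */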
1836static inline int populate_read_section(MemoryRegionSection *section,
1837 void *opaque)
1838{
1839 const hwaddr size = int128_get64(section->size);
1840 hwaddr offset = section->offset_within_region;
1841 RAMBlock *block = section->mr->ram_block;
1842
1843 populate_read_range(block, offset, size);
1844 return 0;
1845}
1846
eeccb99c 1847/*
f7b9dcfb
DH
1848 * ram_block_populate_read: preallocate page tables and populate pages in the
1849 * RAM block by reading a byte of each page.
eeccb99c
AG
1850 *
1851 * Since it's solely used for userfault_fd WP feature, here we just
1852 * hardcode page size to qemu_real_host_page_size.
1853 *
82ea3e3b 1854 * @rb: RAM block to populate
eeccb99c 1855 */
6fee3a1f 1856static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1857{
6fee3a1f
DH
1858 /*
1859 * Skip populating all pages that fall into a discarded range as managed by
1860 * a RamDiscardManager responsible for the mapped memory region of the
1861 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1862 * must not get populated automatically. We don't have to track
1863 * modifications via userfaultfd WP reliably, because these pages will
1864 * not be part of the migration stream either way -- see
1865 * ramblock_dirty_bitmap_exclude_discarded_pages().
1866 *
1867 * Note: The result is only stable while migrating (precopy/postcopy).
1868 */
1869 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1870 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1871 MemoryRegionSection section = {
1872 .mr = rb->mr,
1873 .offset_within_region = 0,
1874 .size = rb->mr->size,
1875 };
1876
1877 ram_discard_manager_replay_populated(rdm, &section,
1878 populate_read_section, NULL);
1879 } else {
1880 populate_read_range(rb, 0, rb->used_length);
1881 }
eeccb99c
AG
1882}
1883
1884/*
1885 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1886 */
1887void ram_write_tracking_prepare(void)
1888{
82ea3e3b 1889 RAMBlock *block;
eeccb99c
AG
1890
1891 RCU_READ_LOCK_GUARD();
1892
82ea3e3b 1893 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1894 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1895 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1896 continue;
1897 }
1898
1899 /*
1900 * Populate pages of the RAM block before enabling userfault_fd
1901 * write protection.
1902 *
1903 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1904 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1905 * pages with pte_none() entries in page table.
1906 */
f7b9dcfb 1907 ram_block_populate_read(block);
eeccb99c
AG
1908 }
1909}
1910
e41c5770
DH
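/*
 * Callback for ram_discard_manager_replay_populated(): apply userfaultfd
 * write protection to one populated section of a RAMBlock.
 */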
1911static inline int uffd_protect_section(MemoryRegionSection *section,
1912 void *opaque)
1913{
1914 const hwaddr size = int128_get64(section->size);
1915 const hwaddr offset = section->offset_within_region;
1916 RAMBlock *rb = section->mr->ram_block;
1917 int uffd_fd = (uintptr_t)opaque;
1918
1919 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1920 false);
1921}
1922
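/*
 * Write-protect a whole RAMBlock via userfaultfd, restricting the operation
 * to its populated parts when a RamDiscardManager is in use.
 */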
1923static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1924{
1925 assert(rb->flags & RAM_UF_WRITEPROTECT);
1926
1927 /* See ram_block_populate_read() */
1928 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1929 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1930 MemoryRegionSection section = {
1931 .mr = rb->mr,
1932 .offset_within_region = 0,
1933 .size = rb->mr->size,
1934 };
1935
1936 return ram_discard_manager_replay_populated(rdm, &section,
1937 uffd_protect_section,
1938 (void *)(uintptr_t)uffd_fd);
1939 }
1940 return uffd_change_protection(uffd_fd, rb->host,
1941 rb->used_length, true, false);
1942}
1943
278e2f55
AG
1944/*
1945 * ram_write_tracking_start: start UFFD-WP memory tracking
1946 *
1947 * Returns 0 for success or negative value in case of error
1948 */
1949int ram_write_tracking_start(void)
1950{
1951 int uffd_fd;
1952 RAMState *rs = ram_state;
82ea3e3b 1953 RAMBlock *block;
278e2f55
AG
1954
1955 /* Open UFFD file descriptor */
1956 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1957 if (uffd_fd < 0) {
1958 return uffd_fd;
1959 }
1960 rs->uffdio_fd = uffd_fd;
1961
1962 RCU_READ_LOCK_GUARD();
1963
82ea3e3b 1964 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1965 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1966 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1967 continue;
1968 }
1969
1970 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1971 if (uffd_register_memory(rs->uffdio_fd, block->host,
1972 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1973 goto fail;
1974 }
72ef3a37
DH
1975 block->flags |= RAM_UF_WRITEPROTECT;
1976 memory_region_ref(block->mr);
1977
278e2f55 1978 /* Apply UFFD write protection to the block memory range */
e41c5770 1979 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1980 goto fail;
1981 }
278e2f55 1982
82ea3e3b
AG
1983 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1984 block->host, block->max_length);
278e2f55
AG
1985 }
1986
1987 return 0;
1988
1989fail:
1990 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1991
82ea3e3b
AG
1992 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1993 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1994 continue;
1995 }
82ea3e3b 1996 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1997 /* Cleanup flags and remove reference */
82ea3e3b
AG
1998 block->flags &= ~RAM_UF_WRITEPROTECT;
1999 memory_region_unref(block->mr);
278e2f55
AG
2000 }
2001
2002 uffd_close_fd(uffd_fd);
2003 rs->uffdio_fd = -1;
2004 return -1;
2005}
2006
2007/**
2008 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2009 */
2010void ram_write_tracking_stop(void)
2011{
2012 RAMState *rs = ram_state;
82ea3e3b 2013 RAMBlock *block;
278e2f55
AG
2014
2015 RCU_READ_LOCK_GUARD();
2016
82ea3e3b
AG
2017 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2018 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
2019 continue;
2020 }
82ea3e3b 2021 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 2022
82ea3e3b
AG
2023 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2024 block->host, block->max_length);
278e2f55
AG
2025
2026 /* Cleanup flags and remove reference */
82ea3e3b
AG
2027 block->flags &= ~RAM_UF_WRITEPROTECT;
2028 memory_region_unref(block->mr);
278e2f55
AG
2029 }
2030
2031 /* Finally close UFFD file descriptor */
2032 uffd_close_fd(rs->uffdio_fd);
2033 rs->uffdio_fd = -1;
2034}
2035
2036#else
2037/* No target OS support, stubs just fail or ignore */
2038
2039static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2040{
2041 (void) rs;
2042 (void) offset;
2043
2044 return NULL;
2045}
2046
2047static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2048 unsigned long start_page)
2049{
2050 (void) rs;
2051 (void) pss;
2052 (void) start_page;
2053
2054 return 0;
2055}
2056
2057bool ram_write_tracking_available(void)
2058{
2059 return false;
2060}
2061
2062bool ram_write_tracking_compatible(void)
2063{
2064 assert(0);
2065 return false;
2066}
2067
2068int ram_write_tracking_start(void)
2069{
2070 assert(0);
2071 return -1;
2072}
2073
2074void ram_write_tracking_stop(void)
2075{
2076 assert(0);
2077}
2078#endif /* defined(__linux__) */
2079
3d0684b2 2080/**
ff1543af 2081 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2082 *
2083 * Skips pages that are already sent (!dirty)
a82d593b 2084 *
a5f7b1a6 2085 * Returns true if a queued page is found
a82d593b 2086 *
6f37bb8b 2087 * @rs: current RAM state
3d0684b2 2088 * @pss: data about the state of the current dirty page scan
a82d593b 2089 */
f20e2865 2090static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2091{
2092 RAMBlock *block;
2093 ram_addr_t offset;
777f53c7
TH
2094 bool dirty;
2095
2096 do {
2097 block = unqueue_page(rs, &offset);
2098 /*
2099 * We're sending this page, and since it's postcopy nothing else
2100 * will dirty it, and we must make sure it doesn't get sent again
2101 * even if this queue request was received after the background
2102 * search already sent it.
2103 */
2104 if (block) {
2105 unsigned long page;
2106
2107 page = offset >> TARGET_PAGE_BITS;
2108 dirty = test_bit(page, block->bmap);
2109 if (!dirty) {
2110 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2111 page);
2112 } else {
2113 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2114 }
2115 }
a82d593b 2116
777f53c7 2117 } while (block && !dirty);
a82d593b 2118
b062106d 2119 if (!block) {
278e2f55
AG
2120 /*
2121 * Poll write faults too if background snapshot is enabled; that's
2122 * when vCPUs may be blocked by write-protected pages.
2123 */
2124 block = poll_fault_page(rs, &offset);
2125 }
2126
a82d593b 2127 if (block) {
a82d593b
DDAG
2128 /*
2129 * We want the background search to continue from the queued page
2130 * since the guest is likely to want other pages near to the page
2131 * it just requested.
2132 */
2133 pss->block = block;
a935e30f 2134 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2135
2136 /*
2137 * This unqueued page would break the "one round" check, even if it
2138 * is really rare.
2139 */
2140 pss->complete_round = false;
a82d593b
DDAG
2141 }
2142
2143 return !!block;
2144}
2145
6c595cde 2146/**
5e58f968
JQ
2147 * migration_page_queue_free: drop any remaining pages in the ram
2148 * request queue
6c595cde 2149 *
3d0684b2
JQ
2150 * It should be empty at the end anyway, but in error cases there may
2151 * be some left. In that case we drop them.
2152 *
6c595cde 2153 */
83c13382 2154static void migration_page_queue_free(RAMState *rs)
6c595cde 2155{
ec481c6c 2156 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2157 /* This queue generally should be empty - but in the case of a failed
2158 * migration it might have some leftover entries.
2159 */
89ac5a1d 2160 RCU_READ_LOCK_GUARD();
ec481c6c 2161 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2162 memory_region_unref(mspr->rb->mr);
ec481c6c 2163 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2164 g_free(mspr);
2165 }
6c595cde
DDAG
2166}
2167
2168/**
3d0684b2
JQ
2169 * ram_save_queue_pages: queue the page for transmission
2170 *
2171 * A request from postcopy destination for example.
2172 *
2173 * Returns zero on success or negative on error
2174 *
3d0684b2
JQ
2175 * @rbname: Name of the RAMBlock of the request. NULL means the
2176 * same as the last one.
2177 * @start: starting address from the start of the RAMBlock
2178 * @len: length (in bytes) to send
6c595cde 2179 */
96506894 2180int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2181{
2182 RAMBlock *ramblock;
53518d94 2183 RAMState *rs = ram_state;
6c595cde 2184
9360447d 2185 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2186 RCU_READ_LOCK_GUARD();
2187
6c595cde
DDAG
2188 if (!rbname) {
2189 /* Reuse last RAMBlock */
68a098f3 2190 ramblock = rs->last_req_rb;
6c595cde
DDAG
2191
2192 if (!ramblock) {
2193 /*
2194 * Shouldn't happen, we can't reuse the last RAMBlock if
2195 * it's the 1st request.
2196 */
2197 error_report("ram_save_queue_pages no previous block");
03acb4e9 2198 return -1;
6c595cde
DDAG
2199 }
2200 } else {
2201 ramblock = qemu_ram_block_by_name(rbname);
2202
2203 if (!ramblock) {
2204 /* We shouldn't be asked for a non-existent RAMBlock */
2205 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2206 return -1;
6c595cde 2207 }
68a098f3 2208 rs->last_req_rb = ramblock;
6c595cde
DDAG
2209 }
2210 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2211 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2212 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2213 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2214 __func__, start, len, ramblock->used_length);
03acb4e9 2215 return -1;
6c595cde
DDAG
2216 }
2217
93589827
PX
2218 /*
2219 * When postcopy preempt is enabled, we send back the page directly in the
2220 * rp-return thread.
2221 */
2222 if (postcopy_preempt_active()) {
2223 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2224 size_t page_size = qemu_ram_pagesize(ramblock);
2225 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2226 int ret = 0;
2227
2228 qemu_mutex_lock(&rs->bitmap_mutex);
2229
2230 pss_init(pss, ramblock, page_start);
2231 /*
2232 * Always use the preempt channel, and make sure it's there. It's
2233 * safe to access without a lock, because when the rp-thread is running
2234 * we should be the only one operating on the qemufile.
2235 */
2236 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2237 assert(pss->pss_channel);
2238
2239 /*
2240 * It must be either one host page or a multiple of the host page size.
2241 * Just assert; if something is wrong we're mostly split brain anyway.
2242 */
2243 assert(len % page_size == 0);
2244 while (len) {
2245 if (ram_save_host_page_urgent(pss)) {
2246 error_report("%s: ram_save_host_page_urgent() failed: "
2247 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2248 __func__, ramblock->idstr, start);
2249 ret = -1;
2250 break;
2251 }
2252 /*
2253 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2254 * will automatically be moved and point to the next host page
2255 * we're going to send, so no need to update here.
2256 *
2257 * Normally QEMU never sends >1 host page in requests, so
2258 * logically we don't even need that as the loop should only
2259 * run once, but just to be consistent.
2260 */
2261 len -= page_size;
2262 };
2263 qemu_mutex_unlock(&rs->bitmap_mutex);
2264
2265 return ret;
2266 }
2267
ec481c6c 2268 struct RAMSrcPageRequest *new_entry =
b21e2380 2269 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2270 new_entry->rb = ramblock;
2271 new_entry->offset = start;
2272 new_entry->len = len;
2273
2274 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2275 qemu_mutex_lock(&rs->src_page_req_mutex);
2276 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2277 migration_make_urgent_request();
ec481c6c 2278 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2279
2280 return 0;
6c595cde
DDAG
2281}
2282
d7400a34
XG
2283static bool save_page_use_compression(RAMState *rs)
2284{
2285 if (!migrate_use_compression()) {
2286 return false;
2287 }
2288
2289 /*
1a373522
DH
2290 * If xbzrle is enabled (e.g., after first round of migration), stop
2291 * using the data compression. In theory, xbzrle can do better than
2292 * compression.
d7400a34 2293 */
1a373522
DH
2294 if (rs->xbzrle_enabled) {
2295 return false;
d7400a34
XG
2296 }
2297
1a373522 2298 return true;
d7400a34
XG
2299}
2300
5e5fdcff
XG
2301/*
2302 * try to compress the page before posting it out, return true if the page
2303 * has been properly handled by compression; otherwise it needs other
2304 * paths to handle it.
2305 */
ec6f3ab9
PX
2306static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2307 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2308{
2309 if (!save_page_use_compression(rs)) {
2310 return false;
2311 }
2312
2313 /*
2314 * When starting the process of a new block, the first page of
2315 * the block should be sent out before other pages in the same
2316 * block, and all the pages in the last block should have been sent
2317 * out; keeping this order is important, because the 'cont' flag
2318 * is used to avoid resending the block name.
2319 *
2320 * We post the first page as a normal page, as compression will take
2321 * much CPU resource.
2322 */
ec6f3ab9 2323 if (block != pss->last_sent_block) {
5e5fdcff
XG
2324 flush_compressed_data(rs);
2325 return false;
2326 }
2327
eaa238ab 2328 if (compress_page_with_multi_thread(block, offset) > 0) {
5e5fdcff
XG
2329 return true;
2330 }
2331
76e03000 2332 compression_counters.busy++;
5e5fdcff
XG
2333 return false;
2334}
2335
a82d593b 2336/**
4010ba38 2337 * ram_save_target_page_legacy: save one target page
a82d593b 2338 *
3d0684b2 2339 * Returns the number of pages written
a82d593b 2340 *
6f37bb8b 2341 * @rs: current RAM state
3d0684b2 2342 * @pss: data about the page we want to send
a82d593b 2343 */
4010ba38 2344static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2345{
a8ec91f9 2346 RAMBlock *block = pss->block;
8bba004c 2347 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2348 int res;
2349
61717ea9 2350 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2351 return res;
2352 }
2353
ec6f3ab9 2354 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2355 return 1;
d7400a34
XG
2356 }
2357
ec6f3ab9 2358 res = save_zero_page(pss, block, offset);
d7400a34
XG
2359 if (res > 0) {
2360 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2361 * page would be stale
2362 */
ef5c3d13 2363 if (rs->xbzrle_enabled) {
d7400a34
XG
2364 XBZRLE_cache_lock();
2365 xbzrle_cache_zero_page(rs, block->offset + offset);
2366 XBZRLE_cache_unlock();
2367 }
d7400a34
XG
2368 return res;
2369 }
2370
da3f56cb 2371 /*
6f39c90b
PX
2372 * Do not use multifd in postcopy as one whole host page should be
2373 * placed. Meanwhile postcopy requires atomic update of pages, so even
2374 * if host page size == guest page size, the destination guest may
2375 * still see partially copied pages at runtime, which is data corruption.
da3f56cb 2376 */
6f39c90b 2377 if (migrate_use_multifd() && !migration_in_postcopy()) {
61717ea9 2378 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2379 }
2380
05931ec5 2381 return ram_save_page(rs, pss);
a82d593b
DDAG
2382}
2383
d9e474ea
PX
2384/* Should be called before sending a host page */
2385static void pss_host_page_prepare(PageSearchStatus *pss)
2386{
2387 /* How many guest pages are there in one host page? */
2388 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2389
2390 pss->host_page_sending = true;
301d7ffe
PX
2391 if (guest_pfns <= 1) {
2392 /*
2393 * This covers both the case when guest psize == host psize and when
2394 * the guest has a larger psize than the host (guest_pfns==0).
2395 *
2396 * For the latter, we always send one whole guest page per
2397 * iteration of the host page (example: an Alpha VM on x86 host
2398 * will have guest psize 8K while host psize 4K).
2399 */
2400 pss->host_page_start = pss->page;
2401 pss->host_page_end = pss->page + 1;
2402 } else {
2403 /*
2404 * The host page spans over multiple guest pages, we send them
2405 * within the same host page iteration.
2406 */
2407 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2408 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2409 }
d9e474ea
PX
2410}
2411
2412/*
2413 * Whether the page pointed by PSS is within the host page being sent.
2414 * Must be called after a previous pss_host_page_prepare().
2415 */
2416static bool pss_within_range(PageSearchStatus *pss)
2417{
2418 ram_addr_t ram_addr;
2419
2420 assert(pss->host_page_sending);
2421
2422 /* Over host-page boundary? */
2423 if (pss->page >= pss->host_page_end) {
2424 return false;
2425 }
2426
2427 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2428
2429 return offset_in_ramblock(pss->block, ram_addr);
2430}
2431
2432static void pss_host_page_finish(PageSearchStatus *pss)
2433{
2434 pss->host_page_sending = false;
2435 /* This is not needed, but just to reset it */
2436 pss->host_page_start = pss->host_page_end = 0;
2437}
2438
93589827
PX
2439/*
2440 * Send an urgent host page specified by `pss'. Needs to be called with
2441 * bitmap_mutex held.
2442 *
2443 * Returns 0 if saving the host page succeeded, negative value otherwise.
2444 */
2445static int ram_save_host_page_urgent(PageSearchStatus *pss)
2446{
2447 bool page_dirty, sent = false;
2448 RAMState *rs = ram_state;
2449 int ret = 0;
2450
2451 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2452 pss_host_page_prepare(pss);
2453
2454 /*
2455 * If precopy is sending the same page, let it be done in precopy, or
2456 * we could send the same page in two channels and none of them will
2457 * receive the whole page.
2458 */
2459 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2460 trace_postcopy_preempt_hit(pss->block->idstr,
2461 pss->page << TARGET_PAGE_BITS);
2462 return 0;
2463 }
2464
2465 do {
2466 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2467
2468 if (page_dirty) {
2469 /* Be strict about the return code; it must be 1, or what else? */
4010ba38 2470 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2471 error_report_once("%s: ram_save_target_page failed", __func__);
2472 ret = -1;
2473 goto out;
2474 }
2475 sent = true;
2476 }
2477 pss_find_next_dirty(pss);
2478 } while (pss_within_range(pss));
2479out:
2480 pss_host_page_finish(pss);
2481 /* For urgent requests, flush immediately if sent */
2482 if (sent) {
2483 qemu_fflush(pss->pss_channel);
2484 }
2485 return ret;
2486}
2487
a82d593b 2488/**
3d0684b2 2489 * ram_save_host_page: save a whole host page
a82d593b 2490 *
3d0684b2
JQ
2491 * Starting at *offset send pages up to the end of the current host
2492 * page. It's valid for the initial offset to point into the middle of
2493 * a host page in which case the remainder of the hostpage is sent.
2494 * Only dirty target pages are sent. Note that the host page size may
2495 * be a huge page for this block.
f3321554 2496 *
1eb3fc0a
DDAG
2497 * The saving stops at the boundary of the used_length of the block
2498 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2499 *
f3321554
PX
2500 * The caller must hold ram_state.bitmap_mutex when calling this
2501 * function. Note that this function can temporarily release the lock, but
2502 * when the function returns it'll make sure the lock is still held.
2503 *
3d0684b2
JQ
2504 * Returns the number of pages written or negative on error
2505 *
6f37bb8b 2506 * @rs: current RAM state
3d0684b2 2507 * @pss: data about the page we want to send
a82d593b 2508 */
05931ec5 2509static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2510{
f3321554 2511 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2512 int tmppages, pages = 0;
a935e30f
JQ
2513 size_t pagesize_bits =
2514 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2515 unsigned long start_page = pss->page;
2516 int res;
4c011c37 2517
fbd162e6 2518 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2519 error_report("block %s should not be migrated !", pss->block->idstr);
2520 return 0;
2521 }
2522
d9e474ea
PX
2523 /* Update host page boundary information */
2524 pss_host_page_prepare(pss);
2525
a82d593b 2526 do {
f3321554 2527 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2528
f3321554
PX
2529 /* Check if the page is dirty, and if it is, send it */
2530 if (page_dirty) {
ba1b7c81 2531 /*
f3321554
PX
2532 * Properly yield the lock only in postcopy preempt mode
2533 * because both migration thread and rp-return thread can
2534 * operate on the bitmaps.
ba1b7c81 2535 */
f3321554
PX
2536 if (preempt_active) {
2537 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2538 }
4010ba38 2539 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2540 if (tmppages >= 0) {
2541 pages += tmppages;
2542 /*
2543 * Allow rate limiting to happen in the middle of huge pages if
2544 * something is sent in the current iteration.
2545 */
2546 if (pagesize_bits > 1 && tmppages > 0) {
2547 migration_rate_limit();
2548 }
2549 }
2550 if (preempt_active) {
2551 qemu_mutex_lock(&rs->bitmap_mutex);
2552 }
2553 } else {
2554 tmppages = 0;
23feba90 2555 }
f3321554
PX
2556
2557 if (tmppages < 0) {
d9e474ea 2558 pss_host_page_finish(pss);
f3321554
PX
2559 return tmppages;
2560 }
2561
d9e474ea
PX
2562 pss_find_next_dirty(pss);
2563 } while (pss_within_range(pss));
2564
2565 pss_host_page_finish(pss);
278e2f55
AG
2566
2567 res = ram_save_release_protection(rs, pss, start_page);
2568 return (res < 0 ? res : pages);
a82d593b 2569}
6c595cde 2570
56e93d26 2571/**
3d0684b2 2572 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2573 *
2574 * Called within an RCU critical section.
2575 *
e8f3735f
XG
2576 * Returns the number of pages written where zero means no dirty pages,
2577 * or negative on error
56e93d26 2578 *
6f37bb8b 2579 * @rs: current RAM state
a82d593b
DDAG
2580 *
2581 * On systems where host-page-size > target-page-size it will send all the
2582 * pages in a host page that are dirty.
56e93d26 2583 */
05931ec5 2584static int ram_find_and_save_block(RAMState *rs)
56e93d26 2585{
f1668764 2586 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2587 int pages = 0;
56e93d26 2588
0827b9e9 2589 /* No dirty page as there is zero RAM */
8d80e195 2590 if (!rs->ram_bytes_total) {
0827b9e9
AA
2591 return pages;
2592 }
2593
4934a5dd
PX
2594 /*
2595 * Always keep last_seen_block/last_page valid during this procedure,
2596 * because find_dirty_block() relies on these values (e.g., we compare
2597 * last_seen_block with pss.block to see whether we searched all the
2598 * ramblocks) to detect the completion of migration. Having a NULL value
2599 * of last_seen_block can conditionally cause the loop below to run forever.
2600 */
2601 if (!rs->last_seen_block) {
2602 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2603 rs->last_page = 0;
2604 }
2605
f1668764 2606 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2607
31e2ac74 2608 while (true) {
51efd36f 2609 if (!get_queued_page(rs, pss)) {
b062106d 2610 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2611 int res = find_dirty_block(rs, pss);
2612 if (res != PAGE_DIRTY_FOUND) {
2613 if (res == PAGE_ALL_CLEAN) {
51efd36f 2614 break;
31e2ac74
JQ
2615 } else if (res == PAGE_TRY_AGAIN) {
2616 continue;
51efd36f
JQ
2617 }
2618 }
56e93d26 2619 }
51efd36f 2620 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2621 if (pages) {
2622 break;
2623 }
2624 }
56e93d26 2625
f1668764
PX
2626 rs->last_seen_block = pss->block;
2627 rs->last_page = pss->page;
56e93d26
JQ
2628
2629 return pages;
2630}
2631
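/*
 * Update the zero/normal page counters and the transfer accounting for
 * @size bytes of guest RAM written to @f.
 */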
2632void acct_update_position(QEMUFile *f, size_t size, bool zero)
2633{
2634 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2635
56e93d26 2636 if (zero) {
23b7576d 2637 stat64_add(&ram_atomic_counters.duplicate, pages);
56e93d26 2638 } else {
23b7576d 2639 stat64_add(&ram_atomic_counters.normal, pages);
4c2d0f6d 2640 ram_transferred_add(size);
1a93bd2f 2641 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2642 }
2643}
2644
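/*
 * Total used size of all migratable RAMBlocks, including blocks whose
 * contents are ignored; compare with ram_bytes_total() below, which
 * skips the ignored ones.
 */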
8008a272 2645static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2646{
2647 RAMBlock *block;
2648 uint64_t total = 0;
2649
89ac5a1d
DDAG
2650 RCU_READ_LOCK_GUARD();
2651
8008a272
JQ
2652 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2653 total += block->used_length;
99e15582 2654 }
56e93d26
JQ
2655 return total;
2656}
2657
fbd162e6
YK
2658uint64_t ram_bytes_total(void)
2659{
8008a272
JQ
2660 RAMBlock *block;
2661 uint64_t total = 0;
2662
2663 RCU_READ_LOCK_GUARD();
2664
2665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2666 total += block->used_length;
2667 }
2668 return total;
fbd162e6
YK
2669}
2670
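/* Destination-side XBZRLE setup: a single decode buffer is enough here. */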
f265e0e4 2671static void xbzrle_load_setup(void)
56e93d26 2672{
f265e0e4 2673 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2674}
2675
f265e0e4
JQ
2676static void xbzrle_load_cleanup(void)
2677{
2678 g_free(XBZRLE.decoded_buf);
2679 XBZRLE.decoded_buf = NULL;
2680}
2681
7d7c96be
PX
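/*
 * Tear down the RAMState created by ram_state_init(): drop any queued
 * page requests and destroy its mutexes before freeing it.
 */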
2682static void ram_state_cleanup(RAMState **rsp)
2683{
b9ccaf6d
DDAG
2684 if (*rsp) {
2685 migration_page_queue_free(*rsp);
2686 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2687 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2688 g_free(*rsp);
2689 *rsp = NULL;
2690 }
7d7c96be
PX
2691}
2692
84593a08
PX
2693static void xbzrle_cleanup(void)
2694{
2695 XBZRLE_cache_lock();
2696 if (XBZRLE.cache) {
2697 cache_fini(XBZRLE.cache);
2698 g_free(XBZRLE.encoded_buf);
2699 g_free(XBZRLE.current_buf);
2700 g_free(XBZRLE.zero_target_page);
2701 XBZRLE.cache = NULL;
2702 XBZRLE.encoded_buf = NULL;
2703 XBZRLE.current_buf = NULL;
2704 XBZRLE.zero_target_page = NULL;
2705 }
2706 XBZRLE_cache_unlock();
2707}
2708
f265e0e4 2709static void ram_save_cleanup(void *opaque)
56e93d26 2710{
53518d94 2711 RAMState **rsp = opaque;
6b6712ef 2712 RAMBlock *block;
eb859c53 2713
278e2f55
AG
2714 /* We don't use dirty log with background snapshots */
2715 if (!migrate_background_snapshot()) {
2716 /* the caller holds the iothread lock or is in a bh, so there is
2717 * no writing race against the migration bitmap
2718 */
63b41db4
HH
2719 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2720 /*
2721 * do not stop dirty log without starting it, since
2722 * memory_global_dirty_log_stop will assert that
2723 * memory_global_dirty_log_start/stop are used in pairs
2724 */
2725 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2726 }
278e2f55 2727 }
6b6712ef 2728
fbd162e6 2729 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2730 g_free(block->clear_bmap);
2731 block->clear_bmap = NULL;
6b6712ef
JQ
2732 g_free(block->bmap);
2733 block->bmap = NULL;
56e93d26
JQ
2734 }
2735
84593a08 2736 xbzrle_cleanup();
f0afa331 2737 compress_threads_save_cleanup();
7d7c96be 2738 ram_state_cleanup(rsp);
4010ba38
JQ
2739 g_free(migration_ops);
2740 migration_ops = NULL;
56e93d26
JQ
2741}
2742
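/*
 * Reset the page-search bookkeeping (last sent/seen block, page index,
 * XBZRLE state) so the next scan starts from the beginning of the
 * RAMBlock list.
 */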
6f37bb8b 2743static void ram_state_reset(RAMState *rs)
56e93d26 2744{
ec6f3ab9
PX
2745 int i;
2746
2747 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2748 rs->pss[i].last_sent_block = NULL;
2749 }
2750
6f37bb8b 2751 rs->last_seen_block = NULL;
269ace29 2752 rs->last_page = 0;
6f37bb8b 2753 rs->last_version = ram_list.version;
1a373522 2754 rs->xbzrle_enabled = false;
56e93d26
JQ
2755}
2756
2757#define MAX_WAIT 50 /* ms, half buffered_file limit */
2758
e0b266f0
DDAG
2759/* **** functions for postcopy ***** */
2760
ced1c616
PB
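/*
 * Release (discard) source-side memory for runs of pages that are no
 * longer marked dirty in each block's bitmap, i.e. memory that has
 * already been migrated.
 */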
2761void ram_postcopy_migrated_memory_release(MigrationState *ms)
2762{
2763 struct RAMBlock *block;
ced1c616 2764
fbd162e6 2765 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2766 unsigned long *bitmap = block->bmap;
2767 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2768 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2769
2770 while (run_start < range) {
2771 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2772 ram_discard_range(block->idstr,
2773 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2774 ((ram_addr_t)(run_end - run_start))
2775 << TARGET_PAGE_BITS);
ced1c616
PB
2776 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2777 }
2778 }
2779}
2780
3d0684b2
JQ
2781/**
2782 * postcopy_send_discard_bm_ram: discard a RAMBlock
2783 *
e0b266f0 2784 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2785 *
2786 * @ms: current migration state
89dab31b 2787 * @block: RAMBlock to discard
e0b266f0 2788 */
9e7d1223 2789static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2790{
6b6712ef 2791 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2792 unsigned long current;
1e7cf8c3 2793 unsigned long *bitmap = block->bmap;
e0b266f0 2794
6b6712ef 2795 for (current = 0; current < end; ) {
1e7cf8c3 2796 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2797 unsigned long zero, discard_length;
e0b266f0 2798
33a5cb62
WY
2799 if (one >= end) {
2800 break;
2801 }
e0b266f0 2802
1e7cf8c3 2803 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2804
2805 if (zero >= end) {
2806 discard_length = end - one;
e0b266f0 2807 } else {
33a5cb62
WY
2808 discard_length = zero - one;
2809 }
810cf2bb 2810 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2811 current = one + discard_length;
e0b266f0 2812 }
e0b266f0
DDAG
2813}
2814
f30c2e5b
PX
2815static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2816
3d0684b2
JQ
2817/**
2818 * postcopy_each_ram_send_discard: discard all RAMBlocks
2819 *
e0b266f0
DDAG
2820 * Utility for the outgoing postcopy code.
2821 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2822 * passing it bitmap indexes and name.
e0b266f0
DDAG
2823 * (qemu_ram_foreach_block ends up passing unscaled lengths
2824 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2825 *
2826 * @ms: current migration state
e0b266f0 2827 */
739fcc1b 2828static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2829{
2830 struct RAMBlock *block;
e0b266f0 2831
fbd162e6 2832 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2833 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2834
f30c2e5b
PX
2835 /*
2836 * Deal with TPS != HPS and huge pages. It discard any partially sent
2837 * host-page size chunks, mark any partially dirty host-page size
2838 * chunks as all dirty. In this case the host-page is the host-page
2839 * for the particular RAMBlock, i.e. it might be a huge page.
2840 */
2841 postcopy_chunk_hostpages_pass(ms, block);
2842
e0b266f0
DDAG
2843 /*
2844 * Postcopy sends chunks of bitmap over the wire, but it
2845 * just needs indexes at this point, avoids it having
2846 * target page specific code.
2847 */
739fcc1b 2848 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2849 postcopy_discard_send_finish(ms);
e0b266f0 2850 }
e0b266f0
DDAG
2851}
2852
3d0684b2 2853/**
8324ef86 2854 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2855 *
2856 * Helper for postcopy_each_ram_send_discard; it canonicalizes the
2857 * dirty bitmap so that, within each host page, the target pages are
2858 * either all dirty or all clean.
99e314eb 2859 *
3d0684b2
JQ
2860 * Postcopy requires that all target pages in a hostpage are dirty or
2861 * clean, not a mix. This function canonicalizes the bitmap.
99e314eb 2862 *
3d0684b2 2863 * @ms: current migration state
3d0684b2 2864 * @block: block that contains the page we want to canonicalize
99e314eb 2865 */
1e7cf8c3 2866static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2867{
53518d94 2868 RAMState *rs = ram_state;
6b6712ef 2869 unsigned long *bitmap = block->bmap;
29c59172 2870 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2871 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2872 unsigned long run_start;
2873
29c59172
DDAG
2874 if (block->page_size == TARGET_PAGE_SIZE) {
2875 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2876 return;
2877 }
2878
1e7cf8c3
WY
2879 /* Find a dirty page */
2880 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2881
6b6712ef 2882 while (run_start < pages) {
99e314eb
DDAG
2883
2884 /*
2885 * If the start of this run of pages is in the middle of a host
2886 * page, then we need to fixup this host page.
2887 */
9dec3cc3 2888 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2889 /* Find the end of this run */
1e7cf8c3 2890 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2891 /*
2892 * If the end isn't at the start of a host page, then the
2893 * run doesn't finish at the end of a host page
2894 * and we need to discard.
2895 */
99e314eb
DDAG
2896 }
2897
9dec3cc3 2898 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2899 unsigned long page;
dad45ab2
WY
2900 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2901 host_ratio);
2902 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2903
99e314eb
DDAG
2904 /* Clean up the bitmap */
2905 for (page = fixup_start_addr;
2906 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2907 /*
2908 * Remark them as dirty, updating the count for any pages
2909 * that weren't previously dirty.
2910 */
0d8ec885 2911 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2912 }
2913 }
2914
1e7cf8c3
WY
2915 /* Find the next dirty page for the next iteration */
2916 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2917 }
2918}
2919
3d0684b2
JQ
2920/**
2921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2922 *
e0b266f0
DDAG
2923 * Transmit the set of pages to be discarded after precopy to the target;
2924 * these are pages that:
2925 * a) Have been previously transmitted but are now dirty again
2926 * b) Have never been transmitted; this ensures that
2927 * any pages on the destination that have been mapped by background
2928 * tasks get discarded (transparent huge pages is the specific concern)
2929 * Hopefully this is pretty sparse
3d0684b2
JQ
2930 *
2931 * @ms: current migration state
e0b266f0 2932 */
739fcc1b 2933void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2934{
53518d94 2935 RAMState *rs = ram_state;
e0b266f0 2936
89ac5a1d 2937 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2938
2939 /* This should be our last sync, the src is now paused */
eb859c53 2940 migration_bitmap_sync(rs);
e0b266f0 2941
6b6712ef 2942 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2943 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2944 rs->last_seen_block = NULL;
6b6712ef 2945 rs->last_page = 0;
e0b266f0 2946
739fcc1b 2947 postcopy_each_ram_send_discard(ms);
e0b266f0 2948
739fcc1b 2949 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2950}
2951
3d0684b2
JQ
2952/**
2953 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2954 *
3d0684b2 2955 * Returns zero on success
e0b266f0 2956 *
36449157
JQ
2957 * @rbname: name of the RAMBlock of the request. NULL means the
2958 * same as the last one.
3d0684b2
JQ
2959 * @start: RAMBlock starting page
2960 * @length: RAMBlock size
e0b266f0 2961 */
aaa2064c 2962int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2963{
36449157 2964 trace_ram_discard_range(rbname, start, length);
d3a5038c 2965
89ac5a1d 2966 RCU_READ_LOCK_GUARD();
36449157 2967 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2968
2969 if (!rb) {
36449157 2970 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2971 return -1;
e0b266f0
DDAG
2972 }
2973
814bb08f
PX
2974 /*
2975 * On source VM, we don't need to update the received bitmap since
2976 * we don't even have one.
2977 */
2978 if (rb->receivedmap) {
2979 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2980 length >> qemu_target_page_bits());
2981 }
2982
03acb4e9 2983 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2984}
2985
84593a08
PX
2986/*
2987 * For every allocation, we will try not to crash the VM if the
2988 * allocation fails.
2989 */
2990static int xbzrle_init(void)
2991{
2992 Error *local_err = NULL;
2993
2994 if (!migrate_use_xbzrle()) {
2995 return 0;
2996 }
2997
2998 XBZRLE_cache_lock();
2999
3000 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3001 if (!XBZRLE.zero_target_page) {
3002 error_report("%s: Error allocating zero page", __func__);
3003 goto err_out;
3004 }
3005
3006 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3007 TARGET_PAGE_SIZE, &local_err);
3008 if (!XBZRLE.cache) {
3009 error_report_err(local_err);
3010 goto free_zero_page;
3011 }
3012
3013 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3014 if (!XBZRLE.encoded_buf) {
3015 error_report("%s: Error allocating encoded_buf", __func__);
3016 goto free_cache;
3017 }
3018
3019 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3020 if (!XBZRLE.current_buf) {
3021 error_report("%s: Error allocating current_buf", __func__);
3022 goto free_encoded_buf;
3023 }
3024
3025 /* We are all good */
3026 XBZRLE_cache_unlock();
3027 return 0;
3028
3029free_encoded_buf:
3030 g_free(XBZRLE.encoded_buf);
3031 XBZRLE.encoded_buf = NULL;
3032free_cache:
3033 cache_fini(XBZRLE.cache);
3034 XBZRLE.cache = NULL;
3035free_zero_page:
3036 g_free(XBZRLE.zero_target_page);
3037 XBZRLE.zero_target_page = NULL;
3038err_out:
3039 XBZRLE_cache_unlock();
3040 return -ENOMEM;
3041}
3042
53518d94 3043static int ram_state_init(RAMState **rsp)
56e93d26 3044{
7d00ee6a
PX
3045 *rsp = g_try_new0(RAMState, 1);
3046
3047 if (!*rsp) {
3048 error_report("%s: Init ramstate fail", __func__);
3049 return -1;
3050 }
53518d94
JQ
3051
3052 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3053 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3054 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 3055 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 3056
7d00ee6a 3057 /*
40c4d4a8
IR
3058 * Count the total number of pages used by ram blocks not including any
3059 * gaps due to alignment or unplugs.
03158519 3060 * This must match the initial value of the dirty bitmap.
7d00ee6a 3061 */
8d80e195 3062 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
3063 ram_state_reset(*rsp);
3064
3065 return 0;
3066}
3067
d6eff5d7 3068static void ram_list_init_bitmaps(void)
7d00ee6a 3069{
002cad6b 3070 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3071 RAMBlock *block;
3072 unsigned long pages;
002cad6b 3073 uint8_t shift;
56e93d26 3074
0827b9e9
AA
3075 /* Skip setting bitmap if there is no RAM */
3076 if (ram_bytes_total()) {
002cad6b
PX
3077 shift = ms->clear_bitmap_shift;
3078 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3079 error_report("clear_bitmap_shift (%u) too big, using "
3080 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3081 shift = CLEAR_BITMAP_SHIFT_MAX;
3082 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3083 error_report("clear_bitmap_shift (%u) too small, using "
3084 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3085 shift = CLEAR_BITMAP_SHIFT_MIN;
3086 }
3087
fbd162e6 3088 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3089 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3090 /*
3091 * The initial dirty bitmap for migration must be set with all
3092 * ones to make sure we'll migrate every guest RAM page to
3093 * the destination.
40c4d4a8
IR
3094 * Here we set RAMBlock.bmap all to 1 because when restarting a
3095 * new migration after a failed migration, ram_list.
3096 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3097 * guest memory.
03158519 3098 */
6b6712ef 3099 block->bmap = bitmap_new(pages);
40c4d4a8 3100 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3101 block->clear_bmap_shift = shift;
3102 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3103 }
f3f491fc 3104 }
d6eff5d7
PX
3105}
3106
be39b4cd
DH
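/*
 * Exclude pages discarded by a RamDiscardManager from the dirty bitmap
 * and from the dirty page count, so they are never migrated.
 */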
3107static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3108{
3109 unsigned long pages;
3110 RAMBlock *rb;
3111
3112 RCU_READ_LOCK_GUARD();
3113
3114 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3115 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3116 rs->migration_dirty_pages -= pages;
3117 }
3118}
3119
d6eff5d7
PX
3120static void ram_init_bitmaps(RAMState *rs)
3121{
3122 /* For memory_global_dirty_log_start below. */
3123 qemu_mutex_lock_iothread();
3124 qemu_mutex_lock_ramlist();
f3f491fc 3125
89ac5a1d
DDAG
3126 WITH_RCU_READ_LOCK_GUARD() {
3127 ram_list_init_bitmaps();
278e2f55
AG
3128 /* We don't use dirty log with background snapshots */
3129 if (!migrate_background_snapshot()) {
63b41db4 3130 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3131 migration_bitmap_sync_precopy(rs);
3132 }
89ac5a1d 3133 }
56e93d26 3134 qemu_mutex_unlock_ramlist();
49877834 3135 qemu_mutex_unlock_iothread();
be39b4cd
DH
3136
3137 /*
3138 * After an eventual first bitmap sync, fixup the initial bitmap
3139 * containing all 1s to exclude any discarded pages from migration.
3140 */
3141 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3142}
3143
3144static int ram_init_all(RAMState **rsp)
3145{
3146 if (ram_state_init(rsp)) {
3147 return -1;
3148 }
3149
3150 if (xbzrle_init()) {
3151 ram_state_cleanup(rsp);
3152 return -1;
3153 }
3154
3155 ram_init_bitmaps(*rsp);
a91246c9
HZ
3156
3157 return 0;
3158}
3159
08614f34
PX
3160static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3161{
3162 RAMBlock *block;
3163 uint64_t pages = 0;
3164
3165 /*
3166 * Postcopy is not using xbzrle/compression, so no need for that.
3167 * Also, since source are already halted, we don't need to care
3168 * about dirty page logging as well.
3169 */
3170
fbd162e6 3171 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3172 pages += bitmap_count_one(block->bmap,
3173 block->used_length >> TARGET_PAGE_BITS);
3174 }
3175
3176 /* This may not be aligned with current bitmaps. Recalculate. */
3177 rs->migration_dirty_pages = pages;
3178
1a373522 3179 ram_state_reset(rs);
08614f34
PX
3180
3181 /* Update RAMState cache of output QEMUFile */
7f401b80 3182 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3183
3184 trace_ram_state_resume_prepare(pages);
3185}
3186
6bcb05fc
WW
3187/*
3188 * This function clears bits of the free pages reported by the caller from the
3189 * migration dirty bitmap. @addr is the host address corresponding to the
3190 * start of the contiguous guest free pages, and @len is the total bytes of
3191 * those pages.
3192 */
3193void qemu_guest_free_page_hint(void *addr, size_t len)
3194{
3195 RAMBlock *block;
3196 ram_addr_t offset;
3197 size_t used_len, start, npages;
3198 MigrationState *s = migrate_get_current();
3199
3200 /* This function is currently expected to be used during live migration */
3201 if (!migration_is_setup_or_active(s->state)) {
3202 return;
3203 }
3204
3205 for (; len > 0; len -= used_len, addr += used_len) {
3206 block = qemu_ram_block_from_host(addr, false, &offset);
3207 if (unlikely(!block || offset >= block->used_length)) {
3208 /*
3209 * The implementation might not support RAMBlock resize during
3210 * live migration, but it could happen in theory with future
3211 * updates. So we add a check here to capture that case.
3212 */
3213 error_report_once("%s unexpected error", __func__);
3214 return;
3215 }
3216
3217 if (len <= block->used_length - offset) {
3218 used_len = len;
3219 } else {
3220 used_len = block->used_length - offset;
3221 }
3222
3223 start = offset >> TARGET_PAGE_BITS;
3224 npages = used_len >> TARGET_PAGE_BITS;
3225
3226 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3227 /*
3228 * The skipped free pages are equivalent to having been sent, from clear_bmap's
3229 * perspective, so clear the bits from the memory region bitmap which
3230 * are initially set. Otherwise those skipped pages will be sent in
3231 * the next round after syncing from the memory region bitmap.
3232 */
1230a25f 3233 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3234 ram_state->migration_dirty_pages -=
3235 bitmap_count_one_with_offset(block->bmap, start, npages);
3236 bitmap_clear(block->bmap, start, npages);
3237 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3238 }
3239}
3240
3d0684b2
JQ
3241/*
3242 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3243 * a long-running RCU critical section. When rcu-reclaims in the code
3244 * start to become numerous it will be necessary to reduce the
3245 * granularity of these critical sections.
3246 */
3247
3d0684b2
JQ
3248/**
3249 * ram_save_setup: Setup RAM for migration
3250 *
3251 * Returns zero to indicate success and negative for error
3252 *
3253 * @f: QEMUFile where to send the data
3254 * @opaque: RAMState pointer
3255 */
a91246c9
HZ
3256static int ram_save_setup(QEMUFile *f, void *opaque)
3257{
53518d94 3258 RAMState **rsp = opaque;
a91246c9 3259 RAMBlock *block;
33d70973 3260 int ret;
a91246c9 3261
dcaf446e
XG
3262 if (compress_threads_save_setup()) {
3263 return -1;
3264 }
3265
a91246c9
HZ
3266 /* migration has already setup the bitmap, reuse it. */
3267 if (!migration_in_colo_state()) {
7d00ee6a 3268 if (ram_init_all(rsp) != 0) {
dcaf446e 3269 compress_threads_save_cleanup();
a91246c9 3270 return -1;
53518d94 3271 }
a91246c9 3272 }
7f401b80 3273 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3274
0e6ebd48 3275 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3276 qemu_put_be64(f, ram_bytes_total_with_ignored()
3277 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3278
0e6ebd48
DDAG
3279 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3280 qemu_put_byte(f, strlen(block->idstr));
3281 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3282 qemu_put_be64(f, block->used_length);
3283 if (migrate_postcopy_ram() && block->page_size !=
3284 qemu_host_page_size) {
3285 qemu_put_be64(f, block->page_size);
3286 }
3287 if (migrate_ignore_shared()) {
3288 qemu_put_be64(f, block->mr->addr);
3289 }
fbd162e6 3290 }
56e93d26
JQ
3291 }
3292
56e93d26
JQ
3293 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3294 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3295
4010ba38
JQ
3296 migration_ops = g_malloc0(sizeof(MigrationOps));
3297 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
33d70973
LB
3298 ret = multifd_send_sync_main(f);
3299 if (ret < 0) {
3300 return ret;
3301 }
3302
56e93d26 3303 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3304 qemu_fflush(f);
56e93d26
JQ
3305
3306 return 0;
3307}
3308
3d0684b2
JQ
3309/**
3310 * ram_save_iterate: iterative stage for migration
3311 *
3312 * Returns zero to indicate success and negative for error
3313 *
3314 * @f: QEMUFile where to send the data
3315 * @opaque: RAMState pointer
3316 */
56e93d26
JQ
3317static int ram_save_iterate(QEMUFile *f, void *opaque)
3318{
53518d94
JQ
3319 RAMState **temp = opaque;
3320 RAMState *rs = *temp;
3d4095b2 3321 int ret = 0;
56e93d26
JQ
3322 int i;
3323 int64_t t0;
5c90308f 3324 int done = 0;
56e93d26 3325
b2557345
PL
3326 if (blk_mig_bulk_active()) {
3327 /* Avoid transferring RAM during the bulk phase of block migration, as
3328 * the bulk phase will usually take a long time and transferring
3329 * RAM updates during that time is pointless. */
3330 goto out;
3331 }
3332
63268c49
PX
3333 /*
3334 * We'll hold this lock for a fairly long time, but that's okay for two
3335 * reasons. Firstly, the only other thread that may take it is the one
3336 * calling qemu_guest_free_page_hint(), which should be rare; secondly,
3337 * see MAX_WAIT below (if curious, also see commit 4508bd9ed8053ce), which
3338 * guarantees that we release it on a regular basis.
3339 */
3340 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3341 WITH_RCU_READ_LOCK_GUARD() {
3342 if (ram_list.version != rs->last_version) {
3343 ram_state_reset(rs);
3344 }
56e93d26 3345
89ac5a1d
DDAG
3346 /* Read version before ram_list.blocks */
3347 smp_rmb();
56e93d26 3348
89ac5a1d 3349 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3350
89ac5a1d
DDAG
3351 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3352 i = 0;
3353 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3354 postcopy_has_request(rs)) {
89ac5a1d 3355 int pages;
e03a34f8 3356
89ac5a1d
DDAG
3357 if (qemu_file_get_error(f)) {
3358 break;
3359 }
e8f3735f 3360
05931ec5 3361 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3362 /* no more pages to send */
3363 if (pages == 0) {
3364 done = 1;
3365 break;
3366 }
e8f3735f 3367
89ac5a1d
DDAG
3368 if (pages < 0) {
3369 qemu_file_set_error(f, pages);
56e93d26
JQ
3370 break;
3371 }
89ac5a1d
DDAG
3372
3373 rs->target_page_count += pages;
3374
644acf99
WY
3375 /*
3376 * During postcopy, it is necessary to make sure one whole host
3377 * page is sent in one chunk.
3378 */
3379 if (migrate_postcopy_ram()) {
3380 flush_compressed_data(rs);
3381 }
3382
89ac5a1d
DDAG
3383 /*
3384 * We want to check on the 1st loop iteration, just in case it was the
3385 * 1st time and we had to sync the dirty bitmap.
3386 * qemu_clock_get_ns() is a bit expensive, so we only check once every
3387 * 64 iterations.
3388 */
3389 if ((i & 63) == 0) {
3390 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3391 1000000;
3392 if (t1 > MAX_WAIT) {
3393 trace_ram_save_iterate_big_wait(t1, i);
3394 break;
3395 }
3396 }
3397 i++;
56e93d26 3398 }
56e93d26 3399 }
63268c49 3400 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3401
3402 /*
3403 * Must occur before EOS (or any QEMUFile operation)
3404 * because of RDMA protocol.
3405 */
3406 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3407
b2557345 3408out:
b69a0227
JQ
3409 if (ret >= 0
3410 && migration_is_setup_or_active(migrate_get_current()->state)) {
7f401b80 3411 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3412 if (ret < 0) {
3413 return ret;
3414 }
3415
3d4095b2
JQ
3416 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3417 qemu_fflush(f);
4c2d0f6d 3418 ram_transferred_add(8);
56e93d26 3419
3d4095b2
JQ
3420 ret = qemu_file_get_error(f);
3421 }
56e93d26
JQ
3422 if (ret < 0) {
3423 return ret;
3424 }
3425
5c90308f 3426 return done;
56e93d26
JQ
3427}
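/*
 * Note on pacing: the send loop above stops when the rate limit is hit
 * (and no postcopy request is pending), when a QEMUFile error is seen,
 * when no dirty pages remain (done = 1), or after more than MAX_WAIT ms
 * have elapsed, a condition only checked once every 64 iterations to
 * keep qemu_clock_get_ns() calls cheap.
 */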
3428
3d0684b2
JQ
3429/**
3430 * ram_save_complete: function called to send the remaining amount of ram
3431 *
e8f3735f 3432 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3433 *
3434 * Called with iothread lock
3435 *
3436 * @f: QEMUFile where to send the data
3437 * @opaque: RAMState pointer
3438 */
56e93d26
JQ
3439static int ram_save_complete(QEMUFile *f, void *opaque)
3440{
53518d94
JQ
3441 RAMState **temp = opaque;
3442 RAMState *rs = *temp;
e8f3735f 3443 int ret = 0;
6f37bb8b 3444
05931ec5
JQ
3445 rs->last_stage = !migration_in_colo_state();
3446
89ac5a1d
DDAG
3447 WITH_RCU_READ_LOCK_GUARD() {
3448 if (!migration_in_postcopy()) {
3449 migration_bitmap_sync_precopy(rs);
3450 }
56e93d26 3451
89ac5a1d 3452 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3453
89ac5a1d 3454 /* try transferring iterative blocks of memory */
56e93d26 3455
89ac5a1d 3456 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3457 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3458 while (true) {
3459 int pages;
56e93d26 3460
05931ec5 3461 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3462 /* no more blocks to send */
3463 if (pages == 0) {
3464 break;
3465 }
3466 if (pages < 0) {
3467 ret = pages;
3468 break;
3469 }
e8f3735f 3470 }
c13221b5 3471 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3472
89ac5a1d
DDAG
3473 flush_compressed_data(rs);
3474 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3475 }
d09a6fde 3476
33d70973
LB
3477 if (ret < 0) {
3478 return ret;
3d4095b2 3479 }
56e93d26 3480
7f401b80 3481 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3482 if (ret < 0) {
3483 return ret;
3484 }
3485
3486 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3487 qemu_fflush(f);
3488
3489 return 0;
56e93d26
JQ
3490}
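/*
 * Unlike ram_save_iterate(), the completion path above ignores rate
 * limiting: it syncs the dirty bitmap (when not in postcopy), drains
 * every remaining dirty page, flushes compressed data, syncs multifd
 * and only then emits RAM_SAVE_FLAG_EOS.
 */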
3491
24beea4e
JQ
3492static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3493 uint64_t *can_postcopy)
56e93d26 3494{
53518d94
JQ
3495 RAMState **temp = opaque;
3496 RAMState *rs = *temp;
56e93d26 3497
c8df4a7a 3498 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3499
c8df4a7a
JQ
3500 if (migrate_postcopy_ram()) {
3501 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3502 *can_postcopy += remaining_size;
c8df4a7a 3503 } else {
24beea4e 3504 *must_precopy += remaining_size;
c8df4a7a
JQ
3505 }
3506}
3507
24beea4e
JQ
3508static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3509 uint64_t *can_postcopy)
c8df4a7a
JQ
3510{
3511 RAMState **temp = opaque;
3512 RAMState *rs = *temp;
3513
3514 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3515
3516 if (!migration_in_postcopy()) {
56e93d26 3517 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3518 WITH_RCU_READ_LOCK_GUARD() {
3519 migration_bitmap_sync_precopy(rs);
3520 }
56e93d26 3521 qemu_mutex_unlock_iothread();
9edabd4d 3522 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3523 }
c31b098f 3524
86e1167e
VSO
3525 if (migrate_postcopy_ram()) {
3526 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3527 *can_postcopy += remaining_size;
86e1167e 3528 } else {
24beea4e 3529 *must_precopy += remaining_size;
86e1167e 3530 }
56e93d26
JQ
3531}
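/*
 * The two helpers above differ only in precision: the estimate reuses
 * the current migration_dirty_pages count, while the exact variant
 * first re-syncs the dirty bitmap under the iothread lock (when not in
 * postcopy) before reporting the same figure, attributed to either
 * must_precopy or can_postcopy depending on migrate_postcopy_ram().
 */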
3532
3533static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3534{
3535 unsigned int xh_len;
3536 int xh_flags;
063e760a 3537 uint8_t *loaded_data;
56e93d26 3538
56e93d26
JQ
3539 /* extract RLE header */
3540 xh_flags = qemu_get_byte(f);
3541 xh_len = qemu_get_be16(f);
3542
3543 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3544 error_report("Failed to load XBZRLE page - wrong compression!");
3545 return -1;
3546 }
3547
3548 if (xh_len > TARGET_PAGE_SIZE) {
3549 error_report("Failed to load XBZRLE page - len overflow!");
3550 return -1;
3551 }
f265e0e4 3552 loaded_data = XBZRLE.decoded_buf;
56e93d26 3553 /* load data and decode */
f265e0e4 3554 /* it can change loaded_data to point to an internal buffer */
063e760a 3555 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3556
3557 /* decode RLE */
063e760a 3558 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3559 TARGET_PAGE_SIZE) == -1) {
3560 error_report("Failed to load XBZRLE page - decode error!");
3561 return -1;
3562 }
3563
3564 return 0;
3565}
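/*
 * For reference, an XBZRLE page record as consumed above consists of:
 *   u8   xh_flags  (must be ENCODING_FLAG_XBZRLE)
 *   be16 xh_len    (encoded length, at most TARGET_PAGE_SIZE)
 *   xh_len bytes of encoded data, decoded over the existing @host page.
 */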
3566
3d0684b2
JQ
3567/**
3568 * ram_block_from_stream: read a RAMBlock id from the migration stream
3569 *
3570 * Must be called from within an RCU critical section.
3571 *
56e93d26 3572 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3573 *
755e8d7c 3574 * @mis: the migration incoming state pointer
3d0684b2
JQ
3575 * @f: QEMUFile where to read the data from
3576 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3577 * @channel: the channel we're using
a7180877 3578 */
755e8d7c 3579static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3580 QEMUFile *f, int flags,
3581 int channel)
56e93d26 3582{
c01b16ed 3583 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3584 char id[256];
3585 uint8_t len;
3586
3587 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3588 if (!block) {
56e93d26
JQ
3589 error_report("Ack, bad migration stream!");
3590 return NULL;
3591 }
4c4bad48 3592 return block;
56e93d26
JQ
3593 }
3594
3595 len = qemu_get_byte(f);
3596 qemu_get_buffer(f, (uint8_t *)id, len);
3597 id[len] = 0;
3598
e3dd7493 3599 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3600 if (!block) {
3601 error_report("Can't find block %s", id);
3602 return NULL;
56e93d26
JQ
3603 }
3604
fbd162e6 3605 if (ramblock_is_ignored(block)) {
b895de50
CLG
3606 error_report("block %s should not be migrated !", id);
3607 return NULL;
3608 }
3609
c01b16ed 3610 mis->last_recv_block[channel] = block;
755e8d7c 3611
4c4bad48
HZ
3612 return block;
3613}
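/*
 * Note: RAM_SAVE_FLAG_CONTINUE lets the source omit the block id when a
 * page belongs to the same RAMBlock as the previous one on this channel;
 * mis->last_recv_block[channel] caches that block per channel, so the
 * lookup by name only happens when the block actually changes.
 */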
3614
3615static inline void *host_from_ram_block_offset(RAMBlock *block,
3616 ram_addr_t offset)
3617{
3618 if (!offset_in_ramblock(block, offset)) {
3619 return NULL;
3620 }
3621
3622 return block->host + offset;
56e93d26
JQ
3623}
3624
6a23f639
DH
3625static void *host_page_from_ram_block_offset(RAMBlock *block,
3626 ram_addr_t offset)
3627{
3628 /* Note: Explicitly no check against offset_in_ramblock(). */
3629 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3630 block->page_size);
3631}
3632
3633static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3634 ram_addr_t offset)
3635{
3636 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3637}
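/*
 * Illustrative example (hypothetical numbers): for a RAMBlock backed by
 * 2 MiB huge pages, an offset whose host address is 0x7f0000243000 gives
 * host_page_from_ram_block_offset() == 0x7f0000200000 and
 * host_page_offset_from_ram_block_offset() == 0x43000, i.e. the start of
 * the containing host page and the offset within it.
 */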
3638
13af18f2 3639static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3640 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3641{
3642 if (!offset_in_ramblock(block, offset)) {
3643 return NULL;
3644 }
3645 if (!block->colo_cache) {
3646 error_report("%s: colo_cache is NULL in block :%s",
3647 __func__, block->idstr);
3648 return NULL;
3649 }
7d9acafa
ZC
3650
3651 /*
3652 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3653 * It helps us decide which pages in the RAM cache should be flushed
3654 * into the VM's RAM later.
3655 */
8af66371
HZ
3656 if (record_bitmap &&
3657 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3658 ram_state->migration_dirty_pages++;
3659 }
13af18f2
ZC
3660 return block->colo_cache + offset;
3661}
3662
3d0684b2
JQ
3663/**
3664 * ram_handle_compressed: handle the zero page case
3665 *
56e93d26
JQ
3666 * If a page (or a whole RDMA chunk) has been
3667 * determined to be zero, then zap it.
3d0684b2
JQ
3668 *
3669 * @host: host address for the zero page
3670 * @ch: what the page is filled from. We only support zero
3671 * @size: size of the zero page
56e93d26
JQ
3672 */
3673void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3674{
bad452a7 3675 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3676 memset(host, ch, size);
3677 }
3678}
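/*
 * The buffer_is_zero() check above avoids writing to pages that are
 * already zero on the destination, so loading a zero page does not
 * needlessly dirty (and possibly allocate) host memory.
 */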
3679
797ca154
XG
3680/* return the size after decompression, or negative value on error */
3681static int
3682qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3683 const uint8_t *source, size_t source_len)
3684{
3685 int err;
3686
3687 err = inflateReset(stream);
3688 if (err != Z_OK) {
3689 return -1;
3690 }
3691
3692 stream->avail_in = source_len;
3693 stream->next_in = (uint8_t *)source;
3694 stream->avail_out = dest_len;
3695 stream->next_out = dest;
3696
3697 err = inflate(stream, Z_NO_FLUSH);
3698 if (err != Z_STREAM_END) {
3699 return -1;
3700 }
3701
3702 return stream->total_out;
3703}
3704
56e93d26
JQ
3705static void *do_data_decompress(void *opaque)
3706{
3707 DecompressParam *param = opaque;
3708 unsigned long pagesize;
33d151f4 3709 uint8_t *des;
34ab9e97 3710 int len, ret;
56e93d26 3711
33d151f4 3712 qemu_mutex_lock(&param->mutex);
90e56fb4 3713 while (!param->quit) {
33d151f4
LL
3714 if (param->des) {
3715 des = param->des;
3716 len = param->len;
3717 param->des = 0;
3718 qemu_mutex_unlock(&param->mutex);
3719
56e93d26 3720 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3721
3722 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3723 param->compbuf, len);
f548222c 3724 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3725 error_report("decompress data failed");
3726 qemu_file_set_error(decomp_file, ret);
3727 }
73a8912b 3728
33d151f4
LL
3729 qemu_mutex_lock(&decomp_done_lock);
3730 param->done = true;
3731 qemu_cond_signal(&decomp_done_cond);
3732 qemu_mutex_unlock(&decomp_done_lock);
3733
3734 qemu_mutex_lock(&param->mutex);
3735 } else {
3736 qemu_cond_wait(&param->cond, &param->mutex);
3737 }
56e93d26 3738 }
33d151f4 3739 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3740
3741 return NULL;
3742}
3743
34ab9e97 3744static int wait_for_decompress_done(void)
5533b2e9
LL
3745{
3746 int idx, thread_count;
3747
3748 if (!migrate_use_compression()) {
34ab9e97 3749 return 0;
5533b2e9
LL
3750 }
3751
3752 thread_count = migrate_decompress_threads();
3753 qemu_mutex_lock(&decomp_done_lock);
3754 for (idx = 0; idx < thread_count; idx++) {
3755 while (!decomp_param[idx].done) {
3756 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3757 }
3758 }
3759 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3760 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3761}
3762
f0afa331 3763static void compress_threads_load_cleanup(void)
56e93d26
JQ
3764{
3765 int i, thread_count;
3766
3416ab5b
JQ
3767 if (!migrate_use_compression()) {
3768 return;
3769 }
56e93d26
JQ
3770 thread_count = migrate_decompress_threads();
3771 for (i = 0; i < thread_count; i++) {
797ca154
XG
3772 /*
3773 * we use it as an indicator of whether the thread has been
3774 * properly initialized or not
3775 */
3776 if (!decomp_param[i].compbuf) {
3777 break;
3778 }
3779
56e93d26 3780 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3781 decomp_param[i].quit = true;
56e93d26
JQ
3782 qemu_cond_signal(&decomp_param[i].cond);
3783 qemu_mutex_unlock(&decomp_param[i].mutex);
3784 }
3785 for (i = 0; i < thread_count; i++) {
797ca154
XG
3786 if (!decomp_param[i].compbuf) {
3787 break;
3788 }
3789
56e93d26
JQ
3790 qemu_thread_join(decompress_threads + i);
3791 qemu_mutex_destroy(&decomp_param[i].mutex);
3792 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3793 inflateEnd(&decomp_param[i].stream);
56e93d26 3794 g_free(decomp_param[i].compbuf);
797ca154 3795 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3796 }
3797 g_free(decompress_threads);
3798 g_free(decomp_param);
56e93d26
JQ
3799 decompress_threads = NULL;
3800 decomp_param = NULL;
34ab9e97 3801 decomp_file = NULL;
56e93d26
JQ
3802}
3803
34ab9e97 3804static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3805{
3806 int i, thread_count;
3807
3808 if (!migrate_use_compression()) {
3809 return 0;
3810 }
3811
3812 thread_count = migrate_decompress_threads();
3813 decompress_threads = g_new0(QemuThread, thread_count);
3814 decomp_param = g_new0(DecompressParam, thread_count);
3815 qemu_mutex_init(&decomp_done_lock);
3816 qemu_cond_init(&decomp_done_cond);
34ab9e97 3817 decomp_file = f;
797ca154
XG
3818 for (i = 0; i < thread_count; i++) {
3819 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3820 goto exit;
3821 }
3822
3823 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3824 qemu_mutex_init(&decomp_param[i].mutex);
3825 qemu_cond_init(&decomp_param[i].cond);
3826 decomp_param[i].done = true;
3827 decomp_param[i].quit = false;
3828 qemu_thread_create(decompress_threads + i, "decompress",
3829 do_data_decompress, decomp_param + i,
3830 QEMU_THREAD_JOINABLE);
3831 }
3832 return 0;
3833exit:
3834 compress_threads_load_cleanup();
3835 return -1;
3836}
3837
c1bc6626 3838static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3839 void *host, int len)
3840{
3841 int idx, thread_count;
3842
3843 thread_count = migrate_decompress_threads();
37396950 3844 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3845 while (true) {
3846 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3847 if (decomp_param[idx].done) {
33d151f4
LL
3848 decomp_param[idx].done = false;
3849 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3850 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3851 decomp_param[idx].des = host;
3852 decomp_param[idx].len = len;
33d151f4
LL
3853 qemu_cond_signal(&decomp_param[idx].cond);
3854 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3855 break;
3856 }
3857 }
3858 if (idx < thread_count) {
3859 break;
73a8912b
LL
3860 } else {
3861 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3862 }
3863 }
3864}
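/*
 * Dispatch model: the loop above hands the compressed buffer to the
 * first idle decompression thread (done == true); if every thread is
 * busy it waits on decomp_done_cond until one becomes free, so the
 * actual inflate work runs in parallel with further stream reads.
 */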
3865
b70cb3b4
RL
3866static void colo_init_ram_state(void)
3867{
3868 ram_state_init(&ram_state);
b70cb3b4
RL
3869}
3870
13af18f2
ZC
3871/*
3872 * colo cache: this is for the secondary VM; we cache the whole
3873 * memory of the secondary VM. It is necessary to hold the global
3874 * lock to call this helper.
3875 */
3876int colo_init_ram_cache(void)
3877{
3878 RAMBlock *block;
3879
44901b5a
PB
3880 WITH_RCU_READ_LOCK_GUARD() {
3881 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3883 NULL, false, false);
44901b5a
PB
3884 if (!block->colo_cache) {
3885 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3886 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3887 block->used_length);
3888 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3889 if (block->colo_cache) {
3890 qemu_anon_ram_free(block->colo_cache, block->used_length);
3891 block->colo_cache = NULL;
3892 }
89ac5a1d 3893 }
44901b5a 3894 return -errno;
89ac5a1d 3895 }
e5fdf920
LS
3896 if (!machine_dump_guest_core(current_machine)) {
3897 qemu_madvise(block->colo_cache, block->used_length,
3898 QEMU_MADV_DONTDUMP);
3899 }
13af18f2 3900 }
13af18f2 3901 }
44901b5a 3902
7d9acafa
ZC
3903 /*
3904 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3905 * decide which pages in the cache should be flushed into the SVM's RAM.
3906 * Here we use the same name 'ram_bitmap' as for migration.
3907 */
3908 if (ram_bytes_total()) {
3909 RAMBlock *block;
3910
fbd162e6 3911 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3912 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3913 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3914 }
3915 }
7d9acafa 3916
b70cb3b4 3917 colo_init_ram_state();
13af18f2 3918 return 0;
13af18f2
ZC
3919}
3920
0393031a
HZ
3921/* TODO: duplicated with ram_init_bitmaps */
3922void colo_incoming_start_dirty_log(void)
3923{
3924 RAMBlock *block = NULL;
3925 /* For memory_global_dirty_log_start below. */
3926 qemu_mutex_lock_iothread();
3927 qemu_mutex_lock_ramlist();
3928
3929 memory_global_dirty_log_sync();
3930 WITH_RCU_READ_LOCK_GUARD() {
3931 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3932 ramblock_sync_dirty_bitmap(ram_state, block);
3933 /* Discard this dirty bitmap record */
3934 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3935 }
63b41db4 3936 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3937 }
3938 ram_state->migration_dirty_pages = 0;
3939 qemu_mutex_unlock_ramlist();
3940 qemu_mutex_unlock_iothread();
3941}
3942
13af18f2
ZC
3943/* It is necessary to hold the global lock to call this helper */
3944void colo_release_ram_cache(void)
3945{
3946 RAMBlock *block;
3947
63b41db4 3948 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3949 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3950 g_free(block->bmap);
3951 block->bmap = NULL;
3952 }
3953
89ac5a1d
DDAG
3954 WITH_RCU_READ_LOCK_GUARD() {
3955 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3956 if (block->colo_cache) {
3957 qemu_anon_ram_free(block->colo_cache, block->used_length);
3958 block->colo_cache = NULL;
3959 }
13af18f2
ZC
3960 }
3961 }
0393031a 3962 ram_state_cleanup(&ram_state);
13af18f2
ZC
3963}
3964
f265e0e4
JQ
3965/**
3966 * ram_load_setup: Setup RAM for migration incoming side
3967 *
3968 * Returns zero to indicate success and negative for error
3969 *
3970 * @f: QEMUFile where to receive the data
3971 * @opaque: RAMState pointer
3972 */
3973static int ram_load_setup(QEMUFile *f, void *opaque)
3974{
34ab9e97 3975 if (compress_threads_load_setup(f)) {
797ca154
XG
3976 return -1;
3977 }
3978
f265e0e4 3979 xbzrle_load_setup();
f9494614 3980 ramblock_recv_map_init();
13af18f2 3981
f265e0e4
JQ
3982 return 0;
3983}
3984
3985static int ram_load_cleanup(void *opaque)
3986{
f9494614 3987 RAMBlock *rb;
56eb90af 3988
fbd162e6 3989 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3990 qemu_ram_block_writeback(rb);
56eb90af
JH
3991 }
3992
f265e0e4 3993 xbzrle_load_cleanup();
f0afa331 3994 compress_threads_load_cleanup();
f9494614 3995
fbd162e6 3996 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3997 g_free(rb->receivedmap);
3998 rb->receivedmap = NULL;
3999 }
13af18f2 4000
f265e0e4
JQ
4001 return 0;
4002}
4003
3d0684b2
JQ
4004/**
4005 * ram_postcopy_incoming_init: allocate postcopy data structures
4006 *
4007 * Returns 0 for success and negative if there was one error
4008 *
4009 * @mis: current migration incoming state
4010 *
4011 * Allocate data structures etc needed by incoming migration with
4012 * postcopy-ram. postcopy-ram's similarly named
4013 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4014 */
4015int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4016{
c136180c 4017 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4018}
4019
3d0684b2
JQ
4020/**
4021 * ram_load_postcopy: load a page in postcopy case
4022 *
4023 * Returns 0 for success or -errno in case of error
4024 *
a7180877
DDAG
4025 * Called in postcopy mode by ram_load().
4026 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4027 *
4028 * @f: QEMUFile where to read the data from
36f62f11 4029 * @channel: the channel to use for loading
a7180877 4030 */
36f62f11 4031int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
4032{
4033 int flags = 0, ret = 0;
4034 bool place_needed = false;
1aa83678 4035 bool matches_target_page_size = false;
a7180877 4036 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 4037 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
4038
4039 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4040 ram_addr_t addr;
a7180877
DDAG
4041 void *page_buffer = NULL;
4042 void *place_source = NULL;
df9ff5e1 4043 RAMBlock *block = NULL;
a7180877 4044 uint8_t ch;
644acf99 4045 int len;
a7180877
DDAG
4046
4047 addr = qemu_get_be64(f);
7a9ddfbf
PX
4048
4049 /*
4050 * If qemu file error, we should stop here, and then "addr"
4051 * may be invalid
4052 */
4053 ret = qemu_file_get_error(f);
4054 if (ret) {
4055 break;
4056 }
4057
a7180877
DDAG
4058 flags = addr & ~TARGET_PAGE_MASK;
4059 addr &= TARGET_PAGE_MASK;
4060
36f62f11 4061 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4062 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4063 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4064 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4065 if (!block) {
4066 ret = -EINVAL;
4067 break;
4068 }
4c4bad48 4069
898ba906
DH
4070 /*
4071 * Relying on used_length is racy and can result in false positives.
4072 * We might place pages beyond used_length in case RAM was shrunk
4073 * while in postcopy, which is fine - trying to place via
4074 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4075 */
4076 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4077 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4078 ret = -EINVAL;
4079 break;
4080 }
77dadc3f 4081 tmp_page->target_pages++;
1aa83678 4082 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4083 /*
28abd200
DDAG
4084 * Postcopy requires that we place whole host pages atomically;
4085 * these may be huge pages for RAMBlocks that are backed by
4086 * hugetlbfs.
a7180877
DDAG
4087 * To make it atomic, the data is read into a temporary page
4088 * that's moved into place later.
4089 * The migration protocol uses, possibly smaller, target-pages
4090 * however the source ensures it always sends all the components
91ba442f 4091 * of a host page in one chunk.
a7180877 4092 */
77dadc3f 4093 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4094 host_page_offset_from_ram_block_offset(block, addr);
4095 /* If all target pages are zero then we can optimise the placement */
77dadc3f
PX
4096 if (tmp_page->target_pages == 1) {
4097 tmp_page->host_addr =
4098 host_page_from_ram_block_offset(block, addr);
4099 } else if (tmp_page->host_addr !=
4100 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4101 /* not the 1st target page within the host page */
36f62f11 4102 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4103 "Target host page %p, received host page %p "
4104 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4105 channel, tmp_page->host_addr,
cfc7dc8a
PX
4106 host_page_from_ram_block_offset(block, addr),
4107 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4108 ret = -EINVAL;
4109 break;
a7180877
DDAG
4110 }
4111
4112 /*
4113 * If it's the last part of a host page then we place the host
4114 * page
4115 */
77dadc3f
PX
4116 if (tmp_page->target_pages ==
4117 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4118 place_needed = true;
4cbb3c63 4119 }
77dadc3f 4120 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4121 }
4122
4123 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4124 case RAM_SAVE_FLAG_ZERO:
a7180877 4125 ch = qemu_get_byte(f);
2e36bc1b
WY
4126 /*
4127 * We can skip setting page_buffer when
4128 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4129 */
4130 if (ch || !matches_target_page_size) {
4131 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4132 }
a7180877 4133 if (ch) {
77dadc3f 4134 tmp_page->all_zero = false;
a7180877
DDAG
4135 }
4136 break;
4137
4138 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4139 tmp_page->all_zero = false;
1aa83678
PX
4140 if (!matches_target_page_size) {
4141 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4142 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4143 } else {
1aa83678
PX
4144 /*
4145 * For small pages that match the target page size, we
4146 * avoid the qemu_file copy. Instead we directly use
4147 * the buffer of QEMUFile to place the page. Note: we
4148 * cannot do any QEMUFile operation before using that
4149 * buffer to make sure the buffer is valid when
4150 * placing the page.
a7180877
DDAG
4151 */
4152 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4153 TARGET_PAGE_SIZE);
4154 }
4155 break;
644acf99 4156 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4157 tmp_page->all_zero = false;
644acf99
WY
4158 len = qemu_get_be32(f);
4159 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4160 error_report("Invalid compressed data length: %d", len);
4161 ret = -EINVAL;
4162 break;
4163 }
4164 decompress_data_with_multi_threads(f, page_buffer, len);
4165 break;
4166
a7180877
DDAG
4167 case RAM_SAVE_FLAG_EOS:
4168 /* normal exit */
6df264ac 4169 multifd_recv_sync_main();
a7180877
DDAG
4170 break;
4171 default:
29fccade 4172 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4173 " (postcopy mode)", flags);
4174 ret = -EINVAL;
7a9ddfbf
PX
4175 break;
4176 }
4177
644acf99
WY
4178 /* Got the whole host page, wait for decompress before placing. */
4179 if (place_needed) {
4180 ret |= wait_for_decompress_done();
4181 }
4182
7a9ddfbf
PX
4183 /* Detect for any possible file errors */
4184 if (!ret && qemu_file_get_error(f)) {
4185 ret = qemu_file_get_error(f);
a7180877
DDAG
4186 }
4187
7a9ddfbf 4188 if (!ret && place_needed) {
77dadc3f
PX
4189 if (tmp_page->all_zero) {
4190 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4191 } else {
77dadc3f
PX
4192 ret = postcopy_place_page(mis, tmp_page->host_addr,
4193 place_source, block);
a7180877 4194 }
ddf35bdf 4195 place_needed = false;
77dadc3f 4196 postcopy_temp_page_reset(tmp_page);
a7180877 4197 }
a7180877
DDAG
4198 }
4199
4200 return ret;
4201}
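/*
 * Postcopy placement recap: target-page-sized pieces accumulate in the
 * per-channel PostcopyTmpPage until a whole host page (possibly a huge
 * page) has been received, and only then is the page installed
 * atomically via postcopy_place_page() or postcopy_place_page_zero().
 */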
4202
acab30b8
DHB
4203static bool postcopy_is_running(void)
4204{
4205 PostcopyState ps = postcopy_state_get();
4206 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4207}
4208
e6f4aa18
ZC
4209/*
4210 * Flush content of RAM cache into SVM's memory.
4211 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4212 */
24fa16f8 4213void colo_flush_ram_cache(void)
e6f4aa18
ZC
4214{
4215 RAMBlock *block = NULL;
4216 void *dst_host;
4217 void *src_host;
4218 unsigned long offset = 0;
4219
d1955d22 4220 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4221 WITH_RCU_READ_LOCK_GUARD() {
4222 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4223 ramblock_sync_dirty_bitmap(ram_state, block);
4224 }
d1955d22 4225 }
d1955d22 4226
e6f4aa18 4227 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4228 WITH_RCU_READ_LOCK_GUARD() {
4229 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4230
89ac5a1d 4231 while (block) {
a6a83cef 4232 unsigned long num = 0;
e6f4aa18 4233
a6a83cef 4234 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4235 if (!offset_in_ramblock(block,
4236 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4237 offset = 0;
a6a83cef 4238 num = 0;
89ac5a1d
DDAG
4239 block = QLIST_NEXT_RCU(block, next);
4240 } else {
a6a83cef
RL
4241 unsigned long i = 0;
4242
4243 for (i = 0; i < num; i++) {
4244 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4245 }
8bba004c
AR
4246 dst_host = block->host
4247 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4248 src_host = block->colo_cache
4249 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4250 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4251 offset += num;
89ac5a1d 4252 }
e6f4aa18
ZC
4253 }
4254 }
e6f4aa18
ZC
4255 trace_colo_flush_ram_cache_end();
4256}
4257
10da4a36
WY
4258/**
4259 * ram_load_precopy: load pages in precopy case
4260 *
4261 * Returns 0 for success or -errno in case of error
4262 *
4263 * Called in precopy mode by ram_load().
4264 * rcu_read_lock is taken prior to this being called.
4265 *
4267 * @f: QEMUFile where to read the data from
4267 */
4268static int ram_load_precopy(QEMUFile *f)
56e93d26 4269{
755e8d7c 4270 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4271 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4272 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4273 bool postcopy_advised = migration_incoming_postcopy_advised();
edc60127
JQ
4274 if (!migrate_use_compression()) {
4275 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4276 }
a7180877 4277
10da4a36 4278 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4279 ram_addr_t addr, total_ram_bytes;
0393031a 4280 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4281 uint8_t ch;
4282
e65cec5e
YK
4283 /*
4284 * Yield periodically to let main loop run, but an iteration of
4285 * the main loop is expensive, so we only do it every 32768 iterations
4286 */
4287 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4288 aio_co_schedule(qemu_get_current_aio_context(),
4289 qemu_coroutine_self());
4290 qemu_coroutine_yield();
4291 }
4292 i++;
4293
56e93d26
JQ
4294 addr = qemu_get_be64(f);
4295 flags = addr & ~TARGET_PAGE_MASK;
4296 addr &= TARGET_PAGE_MASK;
4297
edc60127
JQ
4298 if (flags & invalid_flags) {
4299 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4300 error_report("Received an unexpected compressed page");
4301 }
4302
4303 ret = -EINVAL;
4304 break;
4305 }
4306
bb890ed5 4307 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4308 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4309 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4310 RAM_CHANNEL_PRECOPY);
4c4bad48 4311
0393031a 4312 host = host_from_ram_block_offset(block, addr);
13af18f2 4313 /*
0393031a
HZ
4314 * After entering the COLO stage, we should not load pages into the
4315 * SVM's memory directly; we put them into colo_cache first.
4316 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4317 * Previously, we copied all of this memory in the COLO preparation
4318 * stage, during which the VM had to be stopped, which is time-consuming.
4319 * Here we optimize it with a trick: back up every page during the
4320 * migration process while COLO is enabled. Although this affects the
4321 * speed of the migration, it clearly reduces the downtime of backing
4322 * up all of the SVM's memory in the COLO preparation stage.
13af18f2 4323 */
0393031a
HZ
4324 if (migration_incoming_colo_enabled()) {
4325 if (migration_incoming_in_colo_state()) {
4326 /* In COLO stage, put all pages into cache temporarily */
8af66371 4327 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4328 } else {
4329 /*
4330 * In migration stage but before COLO stage,
4331 * Put all pages into both cache and SVM's memory.
4332 */
8af66371 4333 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4334 }
13af18f2 4335 }
a776aa15
DDAG
4336 if (!host) {
4337 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4338 ret = -EINVAL;
4339 break;
4340 }
13af18f2
ZC
4341 if (!migration_incoming_in_colo_state()) {
4342 ramblock_recv_bitmap_set(block, host);
4343 }
4344
1db9d8e5 4345 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4346 }
4347
56e93d26
JQ
4348 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4349 case RAM_SAVE_FLAG_MEM_SIZE:
4350 /* Synchronize RAM block list */
4351 total_ram_bytes = addr;
4352 while (!ret && total_ram_bytes) {
4353 RAMBlock *block;
56e93d26
JQ
4354 char id[256];
4355 ram_addr_t length;
4356
4357 len = qemu_get_byte(f);
4358 qemu_get_buffer(f, (uint8_t *)id, len);
4359 id[len] = 0;
4360 length = qemu_get_be64(f);
4361
e3dd7493 4362 block = qemu_ram_block_by_name(id);
b895de50
CLG
4363 if (block && !qemu_ram_is_migratable(block)) {
4364 error_report("block %s should not be migrated !", id);
4365 ret = -EINVAL;
4366 } else if (block) {
e3dd7493
DDAG
4367 if (length != block->used_length) {
4368 Error *local_err = NULL;
56e93d26 4369
fa53a0e5 4370 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4371 &local_err);
4372 if (local_err) {
4373 error_report_err(local_err);
56e93d26 4374 }
56e93d26 4375 }
ef08fb38 4376 /* For postcopy we need to check hugepage sizes match */
e846b746 4377 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4378 block->page_size != qemu_host_page_size) {
4379 uint64_t remote_page_size = qemu_get_be64(f);
4380 if (remote_page_size != block->page_size) {
4381 error_report("Mismatched RAM page size %s "
4382 "(local) %zd != %" PRId64,
4383 id, block->page_size,
4384 remote_page_size);
4385 ret = -EINVAL;
4386 }
4387 }
fbd162e6
YK
4388 if (migrate_ignore_shared()) {
4389 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4390 if (ramblock_is_ignored(block) &&
4391 block->mr->addr != addr) {
4392 error_report("Mismatched GPAs for block %s "
4393 "%" PRId64 "!= %" PRId64,
4394 id, (uint64_t)addr,
4395 (uint64_t)block->mr->addr);
4396 ret = -EINVAL;
4397 }
4398 }
e3dd7493
DDAG
4399 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4400 block->idstr);
4401 } else {
56e93d26
JQ
4402 error_report("Unknown ramblock \"%s\", cannot "
4403 "accept migration", id);
4404 ret = -EINVAL;
4405 }
4406
4407 total_ram_bytes -= length;
4408 }
4409 break;
a776aa15 4410
bb890ed5 4411 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4412 ch = qemu_get_byte(f);
4413 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4414 break;
a776aa15 4415
56e93d26 4416 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4417 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4418 break;
56e93d26 4419
a776aa15 4420 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4421 len = qemu_get_be32(f);
4422 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4423 error_report("Invalid compressed data length: %d", len);
4424 ret = -EINVAL;
4425 break;
4426 }
c1bc6626 4427 decompress_data_with_multi_threads(f, host, len);
56e93d26 4428 break;
a776aa15 4429
56e93d26 4430 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4431 if (load_xbzrle(f, addr, host) < 0) {
4432 error_report("Failed to decompress XBZRLE page at "
4433 RAM_ADDR_FMT, addr);
4434 ret = -EINVAL;
4435 break;
4436 }
4437 break;
4438 case RAM_SAVE_FLAG_EOS:
4439 /* normal exit */
6df264ac 4440 multifd_recv_sync_main();
56e93d26
JQ
4441 break;
4442 default:
4443 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4444 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4445 } else {
29fccade 4446 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4447 flags);
4448 ret = -EINVAL;
4449 }
4450 }
4451 if (!ret) {
4452 ret = qemu_file_get_error(f);
4453 }
0393031a
HZ
4454 if (!ret && host_bak) {
4455 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4456 }
56e93d26
JQ
4457 }
4458
ca1a6b70 4459 ret |= wait_for_decompress_done();
10da4a36
WY
4460 return ret;
4461}
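/*
 * For reference, each record consumed above starts with a be64 whose
 * low bits carry RAM_SAVE_FLAG_* values and whose page-aligned part is
 * the address; the payload depends on the flag: the block list for
 * MEM_SIZE, one byte for ZERO, a raw page for PAGE, a be32 length plus
 * compressed data for COMPRESS_PAGE, an XBZRLE record, nothing for EOS,
 * or a hook delegated to ram_control_load_hook().
 */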
4462
4463static int ram_load(QEMUFile *f, void *opaque, int version_id)
4464{
4465 int ret = 0;
4466 static uint64_t seq_iter;
4467 /*
4468 * If the system is running in postcopy mode, page inserts to host memory must
4469 * be atomic
4470 */
4471 bool postcopy_running = postcopy_is_running();
4472
4473 seq_iter++;
4474
4475 if (version_id != 4) {
4476 return -EINVAL;
4477 }
4478
4479 /*
4480 * This RCU critical section can be very long running.
4481 * When RCU reclaims in the code start to become numerous,
4482 * it will be necessary to reduce the granularity of this
4483 * critical section.
4484 */
89ac5a1d
DDAG
4485 WITH_RCU_READ_LOCK_GUARD() {
4486 if (postcopy_running) {
36f62f11
PX
4487 /*
4488 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4489 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4490 * service fast page faults.
4491 */
4492 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4493 } else {
4494 ret = ram_load_precopy(f);
4495 }
10da4a36 4496 }
55c4446b 4497 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4498
56e93d26
JQ
4499 return ret;
4500}
4501
c6467627
VSO
4502static bool ram_has_postcopy(void *opaque)
4503{
469dd51b 4504 RAMBlock *rb;
fbd162e6 4505 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4506 if (ramblock_is_pmem(rb)) {
4507 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4508 "is not supported now!", rb->idstr, rb->host);
4509 return false;
4510 }
4511 }
4512
c6467627
VSO
4513 return migrate_postcopy_ram();
4514}
4515
edd090c7
PX
4516/* Sync all the dirty bitmap with destination VM. */
4517static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4518{
4519 RAMBlock *block;
4520 QEMUFile *file = s->to_dst_file;
4521 int ramblock_count = 0;
4522
4523 trace_ram_dirty_bitmap_sync_start();
4524
fbd162e6 4525 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4526 qemu_savevm_send_recv_bitmap(file, block->idstr);
4527 trace_ram_dirty_bitmap_request(block->idstr);
4528 ramblock_count++;
4529 }
4530
4531 trace_ram_dirty_bitmap_sync_wait();
4532
4533 /* Wait until all the ramblocks' dirty bitmap synced */
4534 while (ramblock_count--) {
4535 qemu_sem_wait(&s->rp_state.rp_sem);
4536 }
4537
4538 trace_ram_dirty_bitmap_sync_complete();
4539
4540 return 0;
4541}
4542
4543static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4544{
4545 qemu_sem_post(&s->rp_state.rp_sem);
4546}
4547
a335debb
PX
4548/*
4549 * Read the received bitmap, revert it as the initial dirty bitmap.
4550 * This is only used when the postcopy migration is paused but wants
4551 * to resume from a middle point.
4552 */
4553int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4554{
4555 int ret = -EINVAL;
43044ac0 4556 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4557 QEMUFile *file = s->rp_state.from_dst_file;
4558 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4559 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4560 uint64_t size, end_mark;
4561
4562 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4563
4564 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4565 error_report("%s: incorrect state %s", __func__,
4566 MigrationStatus_str(s->state));
4567 return -EINVAL;
4568 }
4569
4570 /*
4571 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4572 * need the endianness conversion, and the paddings.
a335debb
PX
4573 */
4574 local_size = ROUND_UP(local_size, 8);
4575
4576 /* Add paddings */
4577 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4578
4579 size = qemu_get_be64(file);
4580
4581 /* The size of the bitmap should match with our ramblock */
4582 if (size != local_size) {
4583 error_report("%s: ramblock '%s' bitmap size mismatch "
4584 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4585 block->idstr, size, local_size);
4586 ret = -EINVAL;
4587 goto out;
4588 }
4589
4590 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4591 end_mark = qemu_get_be64(file);
4592
4593 ret = qemu_file_get_error(file);
4594 if (ret || size != local_size) {
4595 error_report("%s: read bitmap failed for ramblock '%s': %d"
4596 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4597 __func__, block->idstr, ret, local_size, size);
4598 ret = -EIO;
4599 goto out;
4600 }
4601
4602 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4603 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4604 __func__, block->idstr, end_mark);
4605 ret = -EINVAL;
4606 goto out;
4607 }
4608
4609 /*
3a4452d8 4610 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4611 * The dirty bitmap won't change. We can directly modify it.
4612 */
4613 bitmap_from_le(block->bmap, le_bitmap, nbits);
4614
4615 /*
4616 * What we received is "received bitmap". Revert it as the initial
4617 * dirty bitmap for this ramblock.
4618 */
4619 bitmap_complement(block->bmap, block->bmap, nbits);
4620
be39b4cd
DH
4621 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4622 ramblock_dirty_bitmap_clear_discarded_pages(block);
4623
4624 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4625 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4626
edd090c7
PX
4627 /*
4628 * We succeeded in syncing the bitmap for the current ramblock. If this is
4629 * the last one to sync, we need to notify the main send thread.
4630 */
4631 ram_dirty_bitmap_reload_notify(s);
4632
a335debb
PX
4633 ret = 0;
4634out:
bf269906 4635 g_free(le_bitmap);
a335debb
PX
4636 return ret;
4637}
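/*
 * Wire format handled above (produced by ramblock_recv_bitmap_send() on
 * the destination): a be64 size, 'size' bytes of little-endian bitmap
 * rounded up to a multiple of 8 bytes, then a be64
 * RAMBLOCK_RECV_BITMAP_ENDING end mark. The received bitmap is inverted
 * to become the initial dirty bitmap used to resume postcopy.
 */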
4638
edd090c7
PX
4639static int ram_resume_prepare(MigrationState *s, void *opaque)
4640{
4641 RAMState *rs = *(RAMState **)opaque;
08614f34 4642 int ret;
edd090c7 4643
08614f34
PX
4644 ret = ram_dirty_bitmap_sync_all(s, rs);
4645 if (ret) {
4646 return ret;
4647 }
4648
4649 ram_state_resume_prepare(rs, s->to_dst_file);
4650
4651 return 0;
edd090c7
PX
4652}
4653
36f62f11
PX
4654void postcopy_preempt_shutdown_file(MigrationState *s)
4655{
4656 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4657 qemu_fflush(s->postcopy_qemufile_src);
4658}
4659
56e93d26 4660static SaveVMHandlers savevm_ram_handlers = {
9907e842 4661 .save_setup = ram_save_setup,
56e93d26 4662 .save_live_iterate = ram_save_iterate,
763c906b 4663 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4664 .save_live_complete_precopy = ram_save_complete,
c6467627 4665 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4666 .state_pending_exact = ram_state_pending_exact,
4667 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4668 .load_state = ram_load,
f265e0e4
JQ
4669 .save_cleanup = ram_save_cleanup,
4670 .load_setup = ram_load_setup,
4671 .load_cleanup = ram_load_cleanup,
edd090c7 4672 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4673};
4674
c7c0e724
DH
4675static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4676 size_t old_size, size_t new_size)
4677{
cc61c703 4678 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4679 ram_addr_t offset;
4680 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4681 Error *err = NULL;
4682
4683 if (ramblock_is_ignored(rb)) {
4684 return;
4685 }
4686
4687 if (!migration_is_idle()) {
4688 /*
4689 * Precopy code on the source cannot deal with the size of RAM blocks
4690 * changing at random points in time - especially after sending the
4691 * RAM block sizes in the migration stream, they must no longer change.
4692 * Abort and indicate a proper reason.
4693 */
4694 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4695 migration_cancel(err);
c7c0e724 4696 error_free(err);
c7c0e724 4697 }
cc61c703
DH
4698
4699 switch (ps) {
4700 case POSTCOPY_INCOMING_ADVISE:
4701 /*
4702 * Update what ram_postcopy_incoming_init()->init_range() does at the
4703 * time postcopy was advised. Syncing RAM blocks with the source will
4704 * result in RAM resizes.
4705 */
4706 if (old_size < new_size) {
4707 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4708 error_report("RAM block '%s' discard of resized RAM failed",
4709 rb->idstr);
4710 }
4711 }
898ba906 4712 rb->postcopy_length = new_size;
cc61c703
DH
4713 break;
4714 case POSTCOPY_INCOMING_NONE:
4715 case POSTCOPY_INCOMING_RUNNING:
4716 case POSTCOPY_INCOMING_END:
4717 /*
4718 * Once our guest is running, postcopy no longer cares about
4719 * resizes. When growing, the new memory was not available on the
4720 * source, so no handler is needed.
4721 */
4722 break;
4723 default:
4724 error_report("RAM block '%s' resized during postcopy state: %d",
4725 rb->idstr, ps);
4726 exit(-1);
4727 }
c7c0e724
DH
4728}
4729
4730static RAMBlockNotifier ram_mig_ram_notifier = {
4731 .ram_block_resized = ram_mig_ram_block_resized,
4732};
4733
56e93d26
JQ
4734void ram_mig_init(void)
4735{
4736 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4737 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4738 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4739}