migration/ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
53d37d36 43#include "page_cache.h"
56e93d26 44#include "qemu/error-report.h"
e688df6b 45#include "qapi/error.h"
ab7cbb0b 46#include "qapi/qapi-types-migration.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
b0c3cf94 55#include "sysemu/cpu-throttle.h"
edd090c7 56#include "savevm.h"
b9ee2f7d 57#include "qemu/iov.h"
d32ca5ad 58#include "multifd.h"
278e2f55 59#include "sysemu/runstate.h"
1f0776f1 60#include "options.h"
278e2f55 61
e5fdf920
LS
62#include "hw/boards.h" /* for machine_dump_guest_core() */
63
278e2f55
AG
64#if defined(__linux__)
65#include "qemu/userfaultfd.h"
66#endif /* defined(__linux__) */
56e93d26 67
56e93d26
JQ
68/***********************************************************/
69/* ram save/restore */
70
7b548761
JQ
71/*
72 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73 * worked for pages that were filled with the same char. We switched
bb890ed5 74 * it to only search for the zero value, and renamed it to avoid
7b548761 75 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
bb890ed5 76 */
7b548761
JQ
77/*
78 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
79 */
80#define RAM_SAVE_FLAG_FULL 0x01
bb890ed5 81#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
82#define RAM_SAVE_FLAG_MEM_SIZE 0x04
83#define RAM_SAVE_FLAG_PAGE 0x08
84#define RAM_SAVE_FLAG_EOS 0x10
85#define RAM_SAVE_FLAG_CONTINUE 0x20
86#define RAM_SAVE_FLAG_XBZRLE 0x40
7b548761 87/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
56e93d26 88#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
7b548761 89/* We can't use any flag that is bigger than 0x200 */
56e93d26 90
04ffce13 91int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
92 uint8_t *, int) = xbzrle_encode_buffer;
93#if defined(CONFIG_AVX512BW_OPT)
94#include "qemu/cpuid.h"
95static void __attribute__((constructor)) init_cpu_flag(void)
96{
97 unsigned max = __get_cpuid_max(0, NULL);
98 int a, b, c, d;
99 if (max >= 1) {
100 __cpuid(1, a, b, c, d);
101 /* We must check that AVX is not just available, but usable. */
102 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
103 int bv;
104 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
105 __cpuid_count(7, 0, a, b, c, d);
106 /* 0xe6:
107 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
108 * and ZMM16-ZMM31 state are enabled by OS)
109 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
110 */
111 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
112 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
113 }
114 }
115 }
116}
117#endif
118
9360447d
JQ
119XBZRLECacheStats xbzrle_counters;
120
f1668764
PX
121/* used by the search for pages to send */
122struct PageSearchStatus {
123 /* The migration channel used for a specific host page */
124 QEMUFile *pss_channel;
ec6f3ab9
PX
125 /* Last block from where we have sent data */
126 RAMBlock *last_sent_block;
f1668764
PX
127 /* Current block being searched */
128 RAMBlock *block;
129 /* Current page to search from */
130 unsigned long page;
131 /* Set once we wrap around */
132 bool complete_round;
f1668764
PX
133 /* Whether we're sending a host page */
134 bool host_page_sending;
135 /* The start/end of current host page. Invalid if host_page_sending==false */
136 unsigned long host_page_start;
137 unsigned long host_page_end;
138};
139typedef struct PageSearchStatus PageSearchStatus;
140
56e93d26
JQ
141/* struct containing the XBZRLE cache and a static page
142 used by the compression */
143static struct {
144 /* buffer used for XBZRLE encoding */
145 uint8_t *encoded_buf;
146 /* buffer for storing page content */
147 uint8_t *current_buf;
148 /* Cache for XBZRLE, Protected by lock. */
149 PageCache *cache;
150 QemuMutex lock;
c00e0928
JQ
151 /* it will store a page full of zeros */
152 uint8_t *zero_target_page;
f265e0e4
JQ
153 /* buffer used for XBZRLE decoding */
154 uint8_t *decoded_buf;
56e93d26
JQ
155} XBZRLE;
156
56e93d26
JQ
157static void XBZRLE_cache_lock(void)
158{
f4c51a6b 159 if (migrate_use_xbzrle()) {
56e93d26 160 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 161 }
56e93d26
JQ
162}
163
164static void XBZRLE_cache_unlock(void)
165{
f4c51a6b 166 if (migrate_use_xbzrle()) {
56e93d26 167 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 168 }
56e93d26
JQ
169}
170
3d0684b2
JQ
171/**
172 * xbzrle_cache_resize: resize the xbzrle cache
173 *
cbde7be9 174 * This function is called from migrate_params_apply in main
3d0684b2
JQ
175 * thread, possibly while a migration is in progress. A running
176 * migration may be using the cache and might finish during this call,
177 * hence changes to the cache are protected by XBZRLE.lock().
178 *
c9dede2d 179 * Returns 0 for success or -1 for error
3d0684b2
JQ
180 *
181 * @new_size: new cache size
8acabf69 182 * @errp: set *errp if the check failed, with reason
56e93d26 183 */
8b9407a0 184int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
185{
186 PageCache *new_cache;
c9dede2d 187 int64_t ret = 0;
56e93d26 188
8acabf69
JQ
189 /* Check for truncation */
190 if (new_size != (size_t)new_size) {
191 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
192 "exceeding address space");
193 return -1;
194 }
195
2a313e5c
JQ
196 if (new_size == migrate_xbzrle_cache_size()) {
197 /* nothing to do */
c9dede2d 198 return 0;
2a313e5c
JQ
199 }
200
56e93d26
JQ
201 XBZRLE_cache_lock();
202
203 if (XBZRLE.cache != NULL) {
80f8dfde 204 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 205 if (!new_cache) {
56e93d26
JQ
206 ret = -1;
207 goto out;
208 }
209
210 cache_fini(XBZRLE.cache);
211 XBZRLE.cache = new_cache;
212 }
56e93d26
JQ
213out:
214 XBZRLE_cache_unlock();
215 return ret;
216}
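/*
 * Hypothetical usage sketch (not part of the original ram.c): how a caller,
 * in the spirit of migrate_params_apply() mentioned above, might drive
 * xbzrle_cache_resize().  The 64 MiB value is an arbitrary example, not a
 * QEMU default.
 */
static void example_set_xbzrle_cache_size(void)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
        /* resize failed; the old cache is kept, report why */
        error_report_err(err);
    }
}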
217
20123ee1
PX
218static bool postcopy_preempt_active(void)
219{
220 return migrate_postcopy_preempt() && migration_in_postcopy();
221}
222
3ded54b1 223bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
224{
225 return !qemu_ram_is_migratable(block) ||
226 (migrate_ignore_shared() && qemu_ram_is_shared(block));
227}
228
343f632c
DDAG
229#undef RAMBLOCK_FOREACH
230
fbd162e6
YK
231int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
232{
233 RAMBlock *block;
234 int ret = 0;
235
89ac5a1d
DDAG
236 RCU_READ_LOCK_GUARD();
237
fbd162e6
YK
238 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
239 ret = func(block, opaque);
240 if (ret) {
241 break;
242 }
243 }
fbd162e6
YK
244 return ret;
245}
246
f9494614
AP
247static void ramblock_recv_map_init(void)
248{
249 RAMBlock *rb;
250
fbd162e6 251 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
252 assert(!rb->receivedmap);
253 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
254 }
255}
256
257int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
258{
259 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
260 rb->receivedmap);
261}
262
1cba9f6e
DDAG
263bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
264{
265 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
266}
267
f9494614
AP
268void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
269{
270 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
271}
272
273void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
274 size_t nr)
275{
276 bitmap_set_atomic(rb->receivedmap,
277 ramblock_recv_bitmap_offset(host_addr, rb),
278 nr);
279}
280
a335debb
PX
281#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
282
283/*
284 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
285 *
286 * Returns the number of bytes sent (>0) on success, or <0 on error.
287 */
288int64_t ramblock_recv_bitmap_send(QEMUFile *file,
289 const char *block_name)
290{
291 RAMBlock *block = qemu_ram_block_by_name(block_name);
292 unsigned long *le_bitmap, nbits;
293 uint64_t size;
294
295 if (!block) {
296 error_report("%s: invalid block name: %s", __func__, block_name);
297 return -1;
298 }
299
898ba906 300 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
301
302 /*
303 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
304 * machines we may need 4 more bytes for padding (see below
305 * comment). So extend it a bit beforehand.
306 */
307 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
308
309 /*
310 * Always use little endian when sending the bitmap. This is
311 * required when source and destination VMs are not using the
3a4452d8 312 * same endianness. (Note: big endian won't work.)
a335debb
PX
313 */
314 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
315
316 /* Size of the bitmap, in bytes */
a725ef9f 317 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
318
319 /*
320 * size is always aligned to 8 bytes for 64bit machines, but it
321 * may not be true for 32bit machines. We need this padding to
322 * make sure the migration can survive even between 32bit and
323 * 64bit machines.
324 */
325 size = ROUND_UP(size, 8);
326
327 qemu_put_be64(file, size);
328 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
329 /*
330 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 331 * some "mysterious" reason.
a335debb
PX
332 */
333 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
334 qemu_fflush(file);
335
bf269906 336 g_free(le_bitmap);
a335debb
PX
337
338 if (qemu_file_get_error(file)) {
339 return qemu_file_get_error(file);
340 }
341
342 return size + sizeof(size);
343}
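/*
 * Simplified, hypothetical sketch of the destination-side counterpart (the
 * real parsing lives in the load path and is not shown here): it consumes
 * the "size + little-endian bitmap + end marker" layout produced by
 * ramblock_recv_bitmap_send() above and validates the marker.
 */
static int example_recv_bitmap_read(QEMUFile *file, unsigned long *le_bitmap,
                                    uint64_t local_size)
{
    uint64_t size, end_mark;

    size = qemu_get_be64(file);                  /* padded size, in bytes */
    if (size != local_size) {
        return -EINVAL;                          /* doesn't match our block */
    }
    qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
    end_mark = qemu_get_be64(file);
    if (qemu_file_get_error(file) ||
        end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -EIO;                             /* stream corrupted */
    }
    /* the bitmap is little endian on the wire; convert before using it */
    return 0;
}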
344
ec481c6c
JQ
345/*
346 * An outstanding page request, on the source, having been received
347 * and queued
348 */
349struct RAMSrcPageRequest {
350 RAMBlock *rb;
351 hwaddr offset;
352 hwaddr len;
353
354 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
355};
356
6f37bb8b
JQ
357/* State of RAM for migration */
358struct RAMState {
f1668764
PX
359 /*
360 * PageSearchStatus structures for the channels when send pages.
361 * Protected by the bitmap_mutex.
362 */
363 PageSearchStatus pss[RAM_CHANNEL_MAX];
278e2f55
AG
364 /* UFFD file descriptor, used in 'write-tracking' migration */
365 int uffdio_fd;
8d80e195
JQ
366 /* total ram size in bytes */
367 uint64_t ram_bytes_total;
6f37bb8b
JQ
368 /* Last block that we have visited searching for dirty pages */
369 RAMBlock *last_seen_block;
269ace29
JQ
370 /* Last dirty target page we have sent */
371 ram_addr_t last_page;
6f37bb8b
JQ
372 /* last ram version we have seen */
373 uint32_t last_version;
8d820d6f
JQ
374 /* How many times we have dirty too many pages */
375 int dirty_rate_high_cnt;
f664da80
JQ
376 /* these variables are used for bitmap sync */
377 /* last time we did a full bitmap_sync */
378 int64_t time_last_bitmap_sync;
eac74159 379 /* bytes transferred at start_time */
c4bdf0cf 380 uint64_t bytes_xfer_prev;
a66cd90c 381 /* number of dirty pages since start_time */
68908ed6 382 uint64_t num_dirty_pages_period;
b5833fde
JQ
383 /* xbzrle misses since the beginning of the period */
384 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
385 /* Amount of xbzrle pages since the beginning of the period */
386 uint64_t xbzrle_pages_prev;
387 /* Amount of xbzrle encoded bytes since the beginning of the period */
388 uint64_t xbzrle_bytes_prev;
1a373522
DH
389 /* Start using XBZRLE (e.g., after the first round). */
390 bool xbzrle_enabled;
05931ec5
JQ
391 /* Are we on the last stage of migration */
392 bool last_stage;
76e03000
XG
393 /* compression statistics since the beginning of the period */
394 /* amount of count that no free thread to compress data */
395 uint64_t compress_thread_busy_prev;
396 /* amount bytes after compression */
397 uint64_t compressed_size_prev;
398 /* amount of compressed pages */
399 uint64_t compress_pages_prev;
400
be8b02ed
XG
401 /* total handled target pages at the beginning of period */
402 uint64_t target_page_count_prev;
403 /* total handled target pages since start */
404 uint64_t target_page_count;
9360447d 405 /* number of dirty bits in the bitmap */
2dfaf12e 406 uint64_t migration_dirty_pages;
f1668764
PX
407 /*
408 * Protects:
409 * - dirty/clear bitmap
410 * - migration_dirty_pages
411 * - pss structures
412 */
108cfae0 413 QemuMutex bitmap_mutex;
68a098f3
JQ
414 /* The RAMBlock used in the last src_page_requests */
415 RAMBlock *last_req_rb;
ec481c6c
JQ
416 /* Queue of outstanding page requests from the destination */
417 QemuMutex src_page_req_mutex;
b58deb34 418 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
419};
420typedef struct RAMState RAMState;
421
53518d94 422static RAMState *ram_state;
6f37bb8b 423
bd227060
WW
424static NotifierWithReturnList precopy_notifier_list;
425
a1fe28df
PX
426/* Whether postcopy has queued requests? */
427static bool postcopy_has_request(RAMState *rs)
428{
429 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
430}
431
bd227060
WW
432void precopy_infrastructure_init(void)
433{
434 notifier_with_return_list_init(&precopy_notifier_list);
435}
436
437void precopy_add_notifier(NotifierWithReturn *n)
438{
439 notifier_with_return_list_add(&precopy_notifier_list, n);
440}
441
442void precopy_remove_notifier(NotifierWithReturn *n)
443{
444 notifier_with_return_remove(n);
445}
446
447int precopy_notify(PrecopyNotifyReason reason, Error **errp)
448{
449 PrecopyNotifyData pnd;
450 pnd.reason = reason;
451 pnd.errp = errp;
452
453 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
454}
455
9edabd4d 456uint64_t ram_bytes_remaining(void)
2f4fde93 457{
bae416e5
DDAG
458 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
459 0;
2f4fde93
JQ
460}
461
abce5fa1 462RAMStats ram_counters;
96506894 463
26a26069 464void ram_transferred_add(uint64_t bytes)
4c2d0f6d 465{
ae680668 466 if (runstate_is_running()) {
b013b5d1 467 stat64_add(&ram_counters.precopy_bytes, bytes);
ae680668 468 } else if (migration_in_postcopy()) {
abce5fa1 469 stat64_add(&ram_counters.postcopy_bytes, bytes);
ae680668 470 } else {
296a4ac2 471 stat64_add(&ram_counters.downtime_bytes, bytes);
ae680668 472 }
abce5fa1 473 stat64_add(&ram_counters.transferred, bytes);
4c2d0f6d
DE
474}
475
4010ba38
JQ
476struct MigrationOps {
477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478};
479typedef struct MigrationOps MigrationOps;
480
481MigrationOps *migration_ops;
482
76e03000
XG
483CompressionStats compression_counters;
484
56e93d26 485struct CompressParam {
56e93d26 486 bool done;
90e56fb4 487 bool quit;
5e5fdcff 488 bool zero_page;
56e93d26
JQ
489 QEMUFile *file;
490 QemuMutex mutex;
491 QemuCond cond;
492 RAMBlock *block;
493 ram_addr_t offset;
34ab9e97
XG
494
495 /* internally used fields */
dcaf446e 496 z_stream stream;
34ab9e97 497 uint8_t *originbuf;
56e93d26
JQ
498};
499typedef struct CompressParam CompressParam;
500
501struct DecompressParam {
73a8912b 502 bool done;
90e56fb4 503 bool quit;
56e93d26
JQ
504 QemuMutex mutex;
505 QemuCond cond;
506 void *des;
d341d9f3 507 uint8_t *compbuf;
56e93d26 508 int len;
797ca154 509 z_stream stream;
56e93d26
JQ
510};
511typedef struct DecompressParam DecompressParam;
512
513static CompressParam *comp_param;
514static QemuThread *compress_threads;
515/* comp_done_cond is used to wake up the migration thread when
516 * one of the compression threads has finished the compression.
517 * comp_done_lock is used together with comp_done_cond.
518 */
0d9f9a5c
LL
519static QemuMutex comp_done_lock;
520static QemuCond comp_done_cond;
56e93d26 521
34ab9e97 522static QEMUFile *decomp_file;
56e93d26
JQ
523static DecompressParam *decomp_param;
524static QemuThread *decompress_threads;
73a8912b
LL
525static QemuMutex decomp_done_lock;
526static QemuCond decomp_done_cond;
56e93d26 527
93589827
PX
528static int ram_save_host_page_urgent(PageSearchStatus *pss);
529
5e5fdcff 530static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 531 ram_addr_t offset, uint8_t *source_buf);
56e93d26 532
ebd88a49
PX
533/* NOTE: page is the PFN not real ram_addr_t. */
534static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
535{
536 pss->block = rb;
537 pss->page = page;
538 pss->complete_round = false;
539}
540
93589827
PX
541/*
542 * Check whether two PSSs are actively sending the same page. Return true
543 * if it is, false otherwise.
544 */
545static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
546{
547 return pss1->host_page_sending && pss2->host_page_sending &&
548 (pss1->host_page_start == pss2->host_page_start);
549}
550
56e93d26
JQ
551static void *do_data_compress(void *opaque)
552{
553 CompressParam *param = opaque;
a7a9a88f
LL
554 RAMBlock *block;
555 ram_addr_t offset;
5e5fdcff 556 bool zero_page;
56e93d26 557
a7a9a88f 558 qemu_mutex_lock(&param->mutex);
90e56fb4 559 while (!param->quit) {
a7a9a88f
LL
560 if (param->block) {
561 block = param->block;
562 offset = param->offset;
563 param->block = NULL;
564 qemu_mutex_unlock(&param->mutex);
565
5e5fdcff
XG
566 zero_page = do_compress_ram_page(param->file, &param->stream,
567 block, offset, param->originbuf);
a7a9a88f 568
0d9f9a5c 569 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 570 param->done = true;
5e5fdcff 571 param->zero_page = zero_page;
0d9f9a5c
LL
572 qemu_cond_signal(&comp_done_cond);
573 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
574
575 qemu_mutex_lock(&param->mutex);
576 } else {
56e93d26
JQ
577 qemu_cond_wait(&param->cond, &param->mutex);
578 }
56e93d26 579 }
a7a9a88f 580 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
581
582 return NULL;
583}
584
f0afa331 585static void compress_threads_save_cleanup(void)
56e93d26
JQ
586{
587 int i, thread_count;
588
05306935 589 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
590 return;
591 }
05306935 592
56e93d26
JQ
593 thread_count = migrate_compress_threads();
594 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
595 /*
596 * we use it as an indicator of whether the thread is
597 * properly init'd or not
598 */
599 if (!comp_param[i].file) {
600 break;
601 }
05306935
FL
602
603 qemu_mutex_lock(&comp_param[i].mutex);
604 comp_param[i].quit = true;
605 qemu_cond_signal(&comp_param[i].cond);
606 qemu_mutex_unlock(&comp_param[i].mutex);
607
56e93d26 608 qemu_thread_join(compress_threads + i);
56e93d26
JQ
609 qemu_mutex_destroy(&comp_param[i].mutex);
610 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 611 deflateEnd(&comp_param[i].stream);
34ab9e97 612 g_free(comp_param[i].originbuf);
dcaf446e
XG
613 qemu_fclose(comp_param[i].file);
614 comp_param[i].file = NULL;
56e93d26 615 }
0d9f9a5c
LL
616 qemu_mutex_destroy(&comp_done_lock);
617 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
618 g_free(compress_threads);
619 g_free(comp_param);
56e93d26
JQ
620 compress_threads = NULL;
621 comp_param = NULL;
56e93d26
JQ
622}
623
dcaf446e 624static int compress_threads_save_setup(void)
56e93d26
JQ
625{
626 int i, thread_count;
627
628 if (!migrate_use_compression()) {
dcaf446e 629 return 0;
56e93d26 630 }
56e93d26
JQ
631 thread_count = migrate_compress_threads();
632 compress_threads = g_new0(QemuThread, thread_count);
633 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
634 qemu_cond_init(&comp_done_cond);
635 qemu_mutex_init(&comp_done_lock);
56e93d26 636 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
637 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
638 if (!comp_param[i].originbuf) {
639 goto exit;
640 }
641
dcaf446e
XG
642 if (deflateInit(&comp_param[i].stream,
643 migrate_compress_level()) != Z_OK) {
34ab9e97 644 g_free(comp_param[i].originbuf);
dcaf446e
XG
645 goto exit;
646 }
647
e110aa91
C
648 /* comp_param[i].file is just used as a dummy buffer to save data,
649 * set its ops to empty.
56e93d26 650 */
77ef2dc1 651 comp_param[i].file = qemu_file_new_output(
c0e0825c 652 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 653 comp_param[i].done = true;
90e56fb4 654 comp_param[i].quit = false;
56e93d26
JQ
655 qemu_mutex_init(&comp_param[i].mutex);
656 qemu_cond_init(&comp_param[i].cond);
657 qemu_thread_create(compress_threads + i, "compress",
658 do_data_compress, comp_param + i,
659 QEMU_THREAD_JOINABLE);
660 }
dcaf446e
XG
661 return 0;
662
663exit:
664 compress_threads_save_cleanup();
665 return -1;
56e93d26
JQ
666}
667
668/**
3d0684b2 669 * save_page_header: write page header to wire
56e93d26
JQ
670 *
671 * If this is the 1st block, it also writes the block identification
672 *
3d0684b2 673 * Returns the number of bytes written
56e93d26 674 *
ec6f3ab9 675 * @pss: current PSS channel status
56e93d26
JQ
676 * @block: block that contains the page we want to send
677 * @offset: offset inside the block for the page
678 * in the lower bits, it contains flags
679 */
37502df3
LS
680static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
681 RAMBlock *block, ram_addr_t offset)
56e93d26 682{
9f5f380b 683 size_t size, len;
ec6f3ab9 684 bool same_block = (block == pss->last_sent_block);
56e93d26 685
10661f11 686 if (same_block) {
24795694
JQ
687 offset |= RAM_SAVE_FLAG_CONTINUE;
688 }
2bf3aa85 689 qemu_put_be64(f, offset);
56e93d26
JQ
690 size = 8;
691
10661f11 692 if (!same_block) {
9f5f380b 693 len = strlen(block->idstr);
2bf3aa85
JQ
694 qemu_put_byte(f, len);
695 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 696 size += 1 + len;
ec6f3ab9 697 pss->last_sent_block = block;
56e93d26
JQ
698 }
699 return size;
700}
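/*
 * Hypothetical sketch of the matching read side (the real decoding lives in
 * the load path; this only illustrates the header layout written by
 * save_page_header above): the RAM_SAVE_FLAG_* bits travel in the low bits
 * of the page offset, and the block idstr is only present when
 * RAM_SAVE_FLAG_CONTINUE is not set.
 */
static void example_read_page_header(QEMUFile *f)
{
    uint64_t header = qemu_get_be64(f);
    uint64_t flags = header & ~TARGET_PAGE_MASK;  /* RAM_SAVE_FLAG_* bits */
    ram_addr_t addr = header & TARGET_PAGE_MASK;  /* page-aligned offset  */
    char idstr[256];

    if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
        /* a new block: one length byte followed by the unterminated name */
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
    (void)addr;
}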
701
3d0684b2 702/**
179a8080 703 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
704 *
705 * Reduce amount of guest cpu execution to hopefully slow down memory
706 * writes. If guest dirty memory rate is reduced below the rate at
707 * which we can transfer pages to the destination then we should be
708 * able to complete migration. Some workloads dirty memory way too
709 * fast and will not effectively converge, even with auto-converge.
070afca2 710 */
cbbf8182
KZ
711static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
712 uint64_t bytes_dirty_threshold)
070afca2
JH
713{
714 MigrationState *s = migrate_get_current();
2594f56d 715 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
716 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
717 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 718 int pct_max = s->parameters.max_cpu_throttle;
070afca2 719
cbbf8182
KZ
720 uint64_t throttle_now = cpu_throttle_get_percentage();
721 uint64_t cpu_now, cpu_ideal, throttle_inc;
722
070afca2
JH
723 /* We have not started throttling yet. Let's start it. */
724 if (!cpu_throttle_active()) {
725 cpu_throttle_set(pct_initial);
726 } else {
727 /* Throttling already on, just increase the rate */
cbbf8182
KZ
728 if (!pct_tailslow) {
729 throttle_inc = pct_increment;
730 } else {
731 /* Compute the ideal CPU percentage used by Guest, which may
732 * make the dirty rate match the dirty rate threshold. */
733 cpu_now = 100 - throttle_now;
734 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
735 bytes_dirty_period);
736 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
737 }
738 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
739 }
740}
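/*
 * Worked example for the tailslow branch above (illustrative numbers only):
 * with throttle_now = 20%, cpu_now = 80.  If the guest dirtied twice as many
 * bytes as were transferred in the period (bytes_dirty_threshold /
 * bytes_dirty_period = 0.5), then cpu_ideal = 80 * 0.5 = 40 and
 * throttle_inc = MIN(80 - 40, cpu_throttle_increment).  The new throttle is
 * MIN(20 + throttle_inc, max_cpu_throttle), i.e. the guest is slowed just
 * enough to aim the dirty rate at the transfer rate rather than always
 * jumping by the full increment.
 */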
741
91fe9a8d
RL
742void mig_throttle_counter_reset(void)
743{
744 RAMState *rs = ram_state;
745
746 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
747 rs->num_dirty_pages_period = 0;
abce5fa1 748 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
91fe9a8d
RL
749}
750
3d0684b2
JQ
751/**
752 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753 *
6f37bb8b 754 * @rs: current RAM state
3d0684b2
JQ
755 * @current_addr: address for the zero page
756 *
757 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
758 * The important thing is that a stale (not-yet-0'd) page be replaced
759 * by the new data.
760 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 761 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 762 */
6f37bb8b 763static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 764{
56e93d26
JQ
765 /* We don't care if this fails to allocate a new cache page
766 * as long as it updated an old one */
c00e0928 767 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
536b5a4e 768 stat64_get(&ram_counters.dirty_sync_count));
56e93d26
JQ
769}
770
771#define ENCODING_FLAG_XBZRLE 0x1
772
773/**
774 * save_xbzrle_page: compress and send current page
775 *
776 * Returns: 1 means that we wrote the page
777 * 0 means that page is identical to the one already sent
778 * -1 means that xbzrle would be longer than normal
779 *
5a987738 780 * @rs: current RAM state
ec6f3ab9 781 * @pss: current PSS channel
3d0684b2
JQ
782 * @current_data: pointer to the address of the page contents
783 * @current_addr: addr of the page
56e93d26
JQ
784 * @block: block that contains the page we want to send
785 * @offset: offset inside the block for the page
56e93d26 786 */
ec6f3ab9 787static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
61717ea9
PX
788 uint8_t **current_data, ram_addr_t current_addr,
789 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
790{
791 int encoded_len = 0, bytes_xbzrle;
792 uint8_t *prev_cached_page;
ec6f3ab9 793 QEMUFile *file = pss->pss_channel;
536b5a4e 794 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
56e93d26 795
536b5a4e 796 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
9360447d 797 xbzrle_counters.cache_miss++;
05931ec5 798 if (!rs->last_stage) {
56e93d26 799 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
536b5a4e 800 generation) == -1) {
56e93d26
JQ
801 return -1;
802 } else {
803 /* update *current_data when the page has been
804 inserted into cache */
805 *current_data = get_cached_data(XBZRLE.cache, current_addr);
806 }
807 }
808 return -1;
809 }
810
e460a4b1
WW
811 /*
812 * Reaching here means the page has hit the xbzrle cache, no matter what
813 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 814 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
815 *
816 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
817 * 2nd page turns out to be skipped (i.e. no new bytes written to the
818 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
819 * skipped page included. In this way, the encoding rate can tell if the
820 * guest page is good for xbzrle encoding.
821 */
822 xbzrle_counters.pages++;
56e93d26
JQ
823 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824
825 /* save current buffer into memory */
826 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827
828 /* XBZRLE encoding (if there is no overflow) */
04ffce13 829 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
830 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
831 TARGET_PAGE_SIZE);
ca353803
WY
832
833 /*
834 * Update the cache contents, so that it corresponds to the data
835 * sent, in all cases except where we skip the page.
836 */
05931ec5 837 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
838 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839 /*
840 * In the case where we couldn't compress, ensure that the caller
841 * sends the data from the cache, since the guest might have
842 * changed the RAM since we copied it.
843 */
844 *current_data = prev_cached_page;
845 }
846
56e93d26 847 if (encoded_len == 0) {
55c4446b 848 trace_save_xbzrle_page_skipping();
56e93d26
JQ
849 return 0;
850 } else if (encoded_len == -1) {
55c4446b 851 trace_save_xbzrle_page_overflow();
9360447d 852 xbzrle_counters.overflow++;
e460a4b1 853 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
854 return -1;
855 }
856
56e93d26 857 /* Send XBZRLE based compressed page */
37502df3 858 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
204b88b8 859 offset | RAM_SAVE_FLAG_XBZRLE);
61717ea9
PX
860 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
861 qemu_put_be16(file, encoded_len);
862 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
56e93d26 863 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
864 /*
865 * Like compressed_size (please see update_compress_thread_counts),
866 * the xbzrle encoded bytes don't count the 8 byte header with
867 * RAM_SAVE_FLAG_CONTINUE.
868 */
869 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 870 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
871
872 return 1;
873}
874
3d0684b2 875/**
d9e474ea 876 * pss_find_next_dirty: find the next dirty page of current ramblock
f3f491fc 877 *
d9e474ea
PX
878 * This function updates pss->page to point to the next dirty page index
879 * within the ramblock to migrate, or to the end of the ramblock when
880 * nothing is found. Note that when pss->host_page_sending==true it means
881 * we're in the middle of sending a host page, so we won't look for dirty
882 * pages outside the host page boundary.
3d0684b2 883 *
d9e474ea 884 * @pss: the current page search status
f3f491fc 885 */
d9e474ea 886static void pss_find_next_dirty(PageSearchStatus *pss)
56e93d26 887{
d9e474ea 888 RAMBlock *rb = pss->block;
6b6712ef
JQ
889 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
890 unsigned long *bitmap = rb->bmap;
56e93d26 891
fbd162e6 892 if (ramblock_is_ignored(rb)) {
d9e474ea
PX
893 /* Points directly to the end, so we know no dirty page */
894 pss->page = size;
895 return;
896 }
897
898 /*
899 * If we are in the middle of sending a host page, only look for dirty
900 * pages within the current host page being sent.
901 */
902 if (pss->host_page_sending) {
903 assert(pss->host_page_end);
904 size = MIN(size, pss->host_page_end);
b895de50
CLG
905 }
906
d9e474ea 907 pss->page = find_next_bit(bitmap, size, pss->page);
56e93d26
JQ
908}
909
1230a25f 910static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
911 unsigned long page)
912{
913 uint8_t shift;
914 hwaddr size, start;
915
916 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
917 return;
918 }
919
920 shift = rb->clear_bmap_shift;
921 /*
922 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
923 * can make things easier sometimes since then start address
924 * of the small chunk will always be 64 pages aligned so the
925 * bitmap will always be aligned to unsigned long. We should
926 * even be able to remove this restriction but I'm simply
927 * keeping it.
928 */
929 assert(shift >= 6);
930
931 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 932 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
933 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
934 memory_region_clear_dirty_bitmap(rb->mr, start, size);
935}
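/*
 * Worked example (assuming 4 KiB target pages and a clear_bmap_shift of 18,
 * both purely illustrative values): size = 1ULL << (12 + 18) = 1 GiB, so a
 * single clear_bmap bit covers a 1 GiB aligned chunk, and
 * memory_region_clear_dirty_bitmap() is issued for that whole chunk the
 * first time any page inside it is about to be sent.
 */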
936
937static void
1230a25f 938migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
939 unsigned long start,
940 unsigned long npages)
941{
942 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
943 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
944 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
945
946 /*
947 * Clear pages from start to start + npages - 1, so the end boundary is
948 * exclusive.
949 */
950 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 951 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
952 }
953}
954
a6a83cef
RL
955/*
956 * colo_bitmap_find_dirty: find contiguous dirty pages from start
957 *
958 * Returns the page offset within the memory region of the start of the
959 * contiguous dirty pages
960 *
961 * @rs: current RAM state
962 * @rb: RAMBlock where to search for dirty pages
963 * @start: page where we start the search
964 * @num: the number of contiguous dirty pages
965 */
966static inline
967unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
968 unsigned long start, unsigned long *num)
969{
970 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
971 unsigned long *bitmap = rb->bmap;
972 unsigned long first, next;
973
974 *num = 0;
975
976 if (ramblock_is_ignored(rb)) {
977 return size;
978 }
979
980 first = find_next_bit(bitmap, size, start);
981 if (first >= size) {
982 return first;
983 }
984 next = find_next_zero_bit(bitmap, size, first + 1);
985 assert(next >= first);
986 *num = next - first;
987 return first;
988}
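/*
 * Illustrative example (hypothetical bitmap contents): if the dirty bitmap
 * is 0b00111000 and start = 0, find_next_bit() returns first = 3 and
 * find_next_zero_bit() returns 6, so the function reports first = 3 with
 * *num = 3 contiguous dirty pages.
 */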
989
06b10688 990static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
991 RAMBlock *rb,
992 unsigned long page)
a82d593b
DDAG
993{
994 bool ret;
a82d593b 995
002cad6b
PX
996 /*
997 * Clear dirty bitmap if needed. This _must_ be called before we
998 * send any of the pages in the chunk, because we need to make sure
999 * we can capture further page content changes when we sync the dirty
1000 * log the next time. So as long as we are going to send any of
1001 * the pages in the chunk, we clear the remote dirty bitmap for all.
1002 * Clearing it earlier won't be a problem, but too late will.
1003 */
1230a25f 1004 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 1005
6b6712ef 1006 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 1007 if (ret) {
0d8ec885 1008 rs->migration_dirty_pages--;
a82d593b 1009 }
386a907b 1010
a82d593b
DDAG
1011 return ret;
1012}
1013
be39b4cd
DH
1014static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1015 void *opaque)
1016{
1017 const hwaddr offset = section->offset_within_region;
1018 const hwaddr size = int128_get64(section->size);
1019 const unsigned long start = offset >> TARGET_PAGE_BITS;
1020 const unsigned long npages = size >> TARGET_PAGE_BITS;
1021 RAMBlock *rb = section->mr->ram_block;
1022 uint64_t *cleared_bits = opaque;
1023
1024 /*
1025 * We don't grab ram_state->bitmap_mutex because we expect to run
1026 * only when starting migration or during postcopy recovery where
1027 * we don't have concurrent access.
1028 */
1029 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031 }
1032 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033 bitmap_clear(rb->bmap, start, npages);
1034}
1035
1036/*
1037 * Exclude all dirty pages from migration that fall into a discarded range as
1038 * managed by a RamDiscardManager responsible for the mapped memory region of
1039 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040 *
1041 * Discarded pages ("logically unplugged") have undefined content and must
1042 * not get migrated, because even reading these pages for migration might
1043 * result in undesired behavior.
1044 *
1045 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046 *
1047 * Note: The result is only stable while migrating (precopy/postcopy).
1048 */
1049static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050{
1051 uint64_t cleared_bits = 0;
1052
1053 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055 MemoryRegionSection section = {
1056 .mr = rb->mr,
1057 .offset_within_region = 0,
1058 .size = int128_make64(qemu_ram_get_used_length(rb)),
1059 };
1060
1061 ram_discard_manager_replay_discarded(rdm, &section,
1062 dirty_bitmap_clear_section,
1063 &cleared_bits);
1064 }
1065 return cleared_bits;
1066}
1067
9470c5e0
DH
1068/*
1069 * Check if a host-page aligned page falls into a discarded range as managed by
1070 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071 *
1072 * Note: The result is only stable while migrating (precopy/postcopy).
1073 */
1074bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075{
1076 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078 MemoryRegionSection section = {
1079 .mr = rb->mr,
1080 .offset_within_region = start,
1081 .size = int128_make64(qemu_ram_pagesize(rb)),
1082 };
1083
1084 return !ram_discard_manager_is_populated(rdm, &section);
1085 }
1086 return false;
1087}
1088
267691b6 1089/* Called with RCU critical section */
7a3e9571 1090static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1091{
fb613580
KZ
1092 uint64_t new_dirty_pages =
1093 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094
1095 rs->migration_dirty_pages += new_dirty_pages;
1096 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1097}
1098
3d0684b2
JQ
1099/**
1100 * ram_pagesize_summary: calculate all the pagesizes of a VM
1101 *
1102 * Returns a summary bitmap of the page sizes of all RAMBlocks
1103 *
1104 * For VMs with just normal pages this is equivalent to the host page
1105 * size. If it's got some huge pages then it's the OR of all the
1106 * different page sizes.
e8ca1db2
DDAG
1107 */
1108uint64_t ram_pagesize_summary(void)
1109{
1110 RAMBlock *block;
1111 uint64_t summary = 0;
1112
fbd162e6 1113 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1114 summary |= block->page_size;
1115 }
1116
1117 return summary;
1118}
1119
aecbfe9c
XG
1120uint64_t ram_get_total_transferred_pages(void)
1121{
8c0cda8f 1122 return stat64_get(&ram_counters.normal_pages) +
1a386e8d 1123 stat64_get(&ram_counters.zero_pages) +
23b7576d 1124 compression_counters.pages + xbzrle_counters.pages;
aecbfe9c
XG
1125}
1126
b734035b
XG
1127static void migration_update_rates(RAMState *rs, int64_t end_time)
1128{
be8b02ed 1129 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1130 double compressed_size;
b734035b
XG
1131
1132 /* calculate period counters */
1133 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1134 / (end_time - rs->time_last_bitmap_sync);
1135
be8b02ed 1136 if (!page_count) {
b734035b
XG
1137 return;
1138 }
1139
1140 if (migrate_use_xbzrle()) {
e460a4b1
WW
1141 double encoded_size, unencoded_size;
1142
b734035b 1143 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1144 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1145 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1146 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147 TARGET_PAGE_SIZE;
1148 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1149 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1150 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1151 } else {
1152 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1153 }
1154 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1156 }
76e03000
XG
1157
1158 if (migrate_use_compression()) {
1159 compression_counters.busy_rate = (double)(compression_counters.busy -
1160 rs->compress_thread_busy_prev) / page_count;
1161 rs->compress_thread_busy_prev = compression_counters.busy;
1162
1163 compressed_size = compression_counters.compressed_size -
1164 rs->compressed_size_prev;
1165 if (compressed_size) {
1166 double uncompressed_size = (compression_counters.pages -
1167 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1168
1169 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170 compression_counters.compression_rate =
1171 uncompressed_size / compressed_size;
1172
1173 rs->compress_pages_prev = compression_counters.pages;
1174 rs->compressed_size_prev = compression_counters.compressed_size;
1175 }
1176 }
b734035b
XG
1177}
1178
dc14a470
KZ
1179static void migration_trigger_throttle(RAMState *rs)
1180{
1181 MigrationState *s = migrate_get_current();
1182 uint64_t threshold = s->parameters.throttle_trigger_threshold;
23b7576d 1183 uint64_t bytes_xfer_period =
abce5fa1 1184 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
dc14a470
KZ
1185 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1186 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1187
1188 /* During block migration the auto-converge logic incorrectly detects
1189 * that ram migration makes no progress. Avoid this by disabling the
1190 * throttling logic during the bulk phase of block migration. */
1191 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1192 /* The following detection logic can be refined later. For now:
1193 Check to see if the ratio between dirtied bytes and the approx.
1194 amount of bytes that just got transferred since the last time
1195 we were in this routine reaches the threshold. If that happens
1196 twice, start or increase throttling. */
1197
1198 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1199 (++rs->dirty_rate_high_cnt >= 2)) {
1200 trace_migration_throttle();
1201 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1202 mig_throttle_guest_down(bytes_dirty_period,
1203 bytes_dirty_threshold);
dc14a470
KZ
1204 }
1205 }
1206}
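/*
 * Worked example (numbers are illustrative, not defaults): with
 * throttle_trigger_threshold = 50 and 1 GiB transferred during the last
 * period, bytes_dirty_threshold = 512 MiB.  If the guest dirties more than
 * 512 MiB per period for two consecutive periods (dirty_rate_high_cnt
 * reaches 2), auto-converge starts or increases CPU throttling via
 * mig_throttle_guest_down().
 */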
1207
8d820d6f 1208static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1209{
1210 RAMBlock *block;
56e93d26 1211 int64_t end_time;
56e93d26 1212
536b5a4e 1213 stat64_add(&ram_counters.dirty_sync_count, 1);
56e93d26 1214
f664da80
JQ
1215 if (!rs->time_last_bitmap_sync) {
1216 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1217 }
1218
1219 trace_migration_bitmap_sync_start();
9c1f8f44 1220 memory_global_dirty_log_sync();
56e93d26 1221
108cfae0 1222 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1223 WITH_RCU_READ_LOCK_GUARD() {
1224 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1225 ramblock_sync_dirty_bitmap(rs, block);
1226 }
1227 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1228 }
108cfae0 1229 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1230
9458a9a1 1231 memory_global_after_dirty_log_sync();
a66cd90c 1232 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1233
56e93d26
JQ
1234 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235
1236 /* more than 1 second = 1000 milliseconds */
f664da80 1237 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1238 migration_trigger_throttle(rs);
070afca2 1239
b734035b
XG
1240 migration_update_rates(rs, end_time);
1241
be8b02ed 1242 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1243
1244 /* reset period counters */
f664da80 1245 rs->time_last_bitmap_sync = end_time;
a66cd90c 1246 rs->num_dirty_pages_period = 0;
abce5fa1 1247 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
56e93d26 1248 }
4addcd4f 1249 if (migrate_use_events()) {
536b5a4e
JQ
1250 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1251 qapi_event_send_migration_pass(generation);
4addcd4f 1252 }
56e93d26
JQ
1253}
1254
bd227060
WW
1255static void migration_bitmap_sync_precopy(RAMState *rs)
1256{
1257 Error *local_err = NULL;
1258
1259 /*
1260 * The current notifier usage is just an optimization to migration, so we
1261 * don't stop the normal migration process in the error case.
1262 */
1263 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1264 error_report_err(local_err);
b4a1733c 1265 local_err = NULL;
bd227060
WW
1266 }
1267
1268 migration_bitmap_sync(rs);
1269
1270 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1271 error_report_err(local_err);
1272 }
1273}
1274
a4dbaf8e 1275void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1276{
1277 if (!migrate_release_ram() || !migration_in_postcopy()) {
1278 return;
1279 }
1280
1281 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1282}
1283
6c97ec5f
XG
1284/**
1285 * save_zero_page_to_file: send the zero page to the file
1286 *
1287 * Returns the size of data written to the file, 0 means the page is not
1288 * a zero page
1289 *
ec6f3ab9 1290 * @pss: current PSS channel
6c97ec5f
XG
1291 * @block: block that contains the page we want to send
1292 * @offset: offset inside the block for the page
1293 */
37502df3 1294static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
6c97ec5f
XG
1295 RAMBlock *block, ram_addr_t offset)
1296{
1297 uint8_t *p = block->host + offset;
1298 int len = 0;
1299
bad452a7 1300 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
37502df3 1301 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
6c97ec5f
XG
1302 qemu_put_byte(file, 0);
1303 len += 1;
47fe16ff 1304 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1305 }
1306 return len;
1307}
1308
56e93d26 1309/**
3d0684b2 1310 * save_zero_page: send the zero page to the stream
56e93d26 1311 *
3d0684b2 1312 * Returns the number of pages written.
56e93d26 1313 *
ec6f3ab9 1314 * @pss: current PSS channel
56e93d26
JQ
1315 * @block: block that contains the page we want to send
1316 * @offset: offset inside the block for the page
56e93d26 1317 */
37502df3 1318static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
61717ea9 1319 ram_addr_t offset)
56e93d26 1320{
37502df3 1321 int len = save_zero_page_to_file(pss, f, block, offset);
56e93d26 1322
6c97ec5f 1323 if (len) {
1a386e8d 1324 stat64_add(&ram_counters.zero_pages, 1);
4c2d0f6d 1325 ram_transferred_add(len);
6c97ec5f 1326 return 1;
56e93d26 1327 }
6c97ec5f 1328 return -1;
56e93d26
JQ
1329}
1330
059ff0fb
XG
1331/*
1332 * @pages: the number of pages written by the control path,
1333 * < 0 - error
1334 * > 0 - number of pages written
1335 *
1336 * Return true if the page has been saved, otherwise return false.
1337 */
61717ea9
PX
1338static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1339 ram_addr_t offset, int *pages)
059ff0fb
XG
1340{
1341 uint64_t bytes_xmit = 0;
1342 int ret;
1343
1344 *pages = -1;
61717ea9
PX
1345 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1346 TARGET_PAGE_SIZE, &bytes_xmit);
059ff0fb
XG
1347 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1348 return false;
1349 }
1350
1351 if (bytes_xmit) {
4c2d0f6d 1352 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1353 *pages = 1;
1354 }
1355
1356 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1357 return true;
1358 }
1359
1360 if (bytes_xmit > 0) {
8c0cda8f 1361 stat64_add(&ram_counters.normal_pages, 1);
059ff0fb 1362 } else if (bytes_xmit == 0) {
1a386e8d 1363 stat64_add(&ram_counters.zero_pages, 1);
059ff0fb
XG
1364 }
1365
1366 return true;
1367}
1368
65dacaa0
XG
1369/*
1370 * directly send the page to the stream
1371 *
1372 * Returns the number of pages written.
1373 *
ec6f3ab9 1374 * @pss: current PSS channel
65dacaa0
XG
1375 * @block: block that contains the page we want to send
1376 * @offset: offset inside the block for the page
1377 * @buf: the page to be sent
1378 * @async: send the page asynchronously
1379 */
ec6f3ab9 1380static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1381 ram_addr_t offset, uint8_t *buf, bool async)
65dacaa0 1382{
ec6f3ab9
PX
1383 QEMUFile *file = pss->pss_channel;
1384
37502df3 1385 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
4c2d0f6d 1386 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0 1387 if (async) {
61717ea9 1388 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
f912ec5b 1389 migrate_release_ram() &&
65dacaa0
XG
1390 migration_in_postcopy());
1391 } else {
61717ea9 1392 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
65dacaa0 1393 }
4c2d0f6d 1394 ram_transferred_add(TARGET_PAGE_SIZE);
8c0cda8f 1395 stat64_add(&ram_counters.normal_pages, 1);
65dacaa0
XG
1396 return 1;
1397}
1398
56e93d26 1399/**
3d0684b2 1400 * ram_save_page: send the given page to the stream
56e93d26 1401 *
3d0684b2 1402 * Returns the number of pages written.
3fd3c4b3
DDAG
1403 * < 0 - error
1404 * >=0 - Number of pages written - this might legally be 0
1405 * if xbzrle noticed the page was the same.
56e93d26 1406 *
6f37bb8b 1407 * @rs: current RAM state
56e93d26
JQ
1408 * @block: block that contains the page we want to send
1409 * @offset: offset inside the block for the page
56e93d26 1410 */
05931ec5 1411static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1412{
1413 int pages = -1;
56e93d26 1414 uint8_t *p;
56e93d26 1415 bool send_async = true;
a08f6890 1416 RAMBlock *block = pss->block;
8bba004c 1417 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1418 ram_addr_t current_addr = block->offset + offset;
56e93d26 1419
2f68e399 1420 p = block->host + offset;
1db9d8e5 1421 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1422
56e93d26 1423 XBZRLE_cache_lock();
1a373522 1424 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
ec6f3ab9 1425 pages = save_xbzrle_page(rs, pss, &p, current_addr,
61717ea9 1426 block, offset);
05931ec5 1427 if (!rs->last_stage) {
059ff0fb
XG
1428 /* Can't send this cached data async, since the cache page
1429 * might get updated before it gets to the wire
56e93d26 1430 */
059ff0fb 1431 send_async = false;
56e93d26
JQ
1432 }
1433 }
1434
1435 /* XBZRLE overflow or normal page */
1436 if (pages == -1) {
ec6f3ab9 1437 pages = save_normal_page(pss, block, offset, p, send_async);
56e93d26
JQ
1438 }
1439
1440 XBZRLE_cache_unlock();
1441
1442 return pages;
1443}
1444
61717ea9 1445static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
b9ee2f7d
JQ
1446 ram_addr_t offset)
1447{
61717ea9 1448 if (multifd_queue_page(file, block, offset) < 0) {
713f762a
IR
1449 return -1;
1450 }
8c0cda8f 1451 stat64_add(&ram_counters.normal_pages, 1);
b9ee2f7d
JQ
1452
1453 return 1;
1454}
1455
5e5fdcff 1456static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1457 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1458{
53518d94 1459 RAMState *rs = ram_state;
ec6f3ab9 1460 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
20d549cb 1461 uint8_t *p = block->host + offset;
6ef3771c 1462 int ret;
56e93d26 1463
37502df3 1464 if (save_zero_page_to_file(pss, f, block, offset)) {
e7f2e190 1465 return true;
5e5fdcff
XG
1466 }
1467
37502df3 1468 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1469
1470 /*
1471 * copy it to an internal buffer to avoid it being modified by the VM,
1472 * so that we can catch any error during compression and
1473 * decompression
1474 */
1475 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1476 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1477 if (ret < 0) {
1478 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1479 error_report("compressed data failed!");
b3be2896 1480 }
e7f2e190 1481 return false;
5e5fdcff
XG
1482}
1483
1484static void
1485update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1486{
4c2d0f6d 1487 ram_transferred_add(bytes_xmit);
76e03000 1488
5e5fdcff 1489 if (param->zero_page) {
1a386e8d 1490 stat64_add(&ram_counters.zero_pages, 1);
76e03000 1491 return;
5e5fdcff 1492 }
76e03000
XG
1493
1494 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1495 compression_counters.compressed_size += bytes_xmit - 8;
1496 compression_counters.pages++;
56e93d26
JQ
1497}
1498
32b05495
XG
1499static bool save_page_use_compression(RAMState *rs);
1500
ce25d337 1501static void flush_compressed_data(RAMState *rs)
56e93d26 1502{
eaa238ab 1503 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1504 int idx, len, thread_count;
1505
32b05495 1506 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1507 return;
1508 }
1509 thread_count = migrate_compress_threads();
a7a9a88f 1510
0d9f9a5c 1511 qemu_mutex_lock(&comp_done_lock);
56e93d26 1512 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1513 while (!comp_param[idx].done) {
0d9f9a5c 1514 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1515 }
a7a9a88f 1516 }
0d9f9a5c 1517 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1518
1519 for (idx = 0; idx < thread_count; idx++) {
1520 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1521 if (!comp_param[idx].quit) {
eaa238ab 1522 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
5e5fdcff
XG
1523 /*
1524 * it's safe to fetch zero_page without holding comp_done_lock
1525 * as there is no further request submitted to the thread,
1526 * i.e., the thread should be waiting for a request at this point.
1527 */
1528 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1529 }
a7a9a88f 1530 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1531 }
1532}
1533
1534static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1535 ram_addr_t offset)
1536{
1537 param->block = block;
1538 param->offset = offset;
1539}
1540
eaa238ab 1541static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
1542{
1543 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1544 bool wait = migrate_compress_wait_thread();
eaa238ab 1545 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1546
1547 thread_count = migrate_compress_threads();
0d9f9a5c 1548 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1549retry:
1550 for (idx = 0; idx < thread_count; idx++) {
1551 if (comp_param[idx].done) {
1552 comp_param[idx].done = false;
eaa238ab
PX
1553 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1554 comp_param[idx].file);
1d58872a
XG
1555 qemu_mutex_lock(&comp_param[idx].mutex);
1556 set_compress_params(&comp_param[idx], block, offset);
1557 qemu_cond_signal(&comp_param[idx].cond);
1558 qemu_mutex_unlock(&comp_param[idx].mutex);
1559 pages = 1;
5e5fdcff 1560 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1561 break;
56e93d26
JQ
1562 }
1563 }
1d58872a
XG
1564
1565 /*
1566 * wait for the free thread if the user specifies 'compress-wait-thread',
1567 * otherwise we will post the page out in the main thread as normal page.
1568 */
1569 if (pages < 0 && wait) {
1570 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1571 goto retry;
1572 }
0d9f9a5c 1573 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1574
1575 return pages;
1576}
1577
31e2ac74
JQ
1578#define PAGE_ALL_CLEAN 0
1579#define PAGE_TRY_AGAIN 1
1580#define PAGE_DIRTY_FOUND 2
3d0684b2
JQ
1581/**
1582 * find_dirty_block: find the next dirty page and update any state
1583 * associated with the search process.
b9e60928 1584 *
31e2ac74
JQ
1585 * Returns:
1586 * PAGE_ALL_CLEAN: no dirty page found, give up
1587 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1588 * PAGE_DIRTY_FOUND: dirty page found
b9e60928 1589 *
6f37bb8b 1590 * @rs: current RAM state
3d0684b2
JQ
1591 * @pss: data about the state of the current dirty page scan
1592 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1593 */
31e2ac74 1594static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
b9e60928 1595{
d9e474ea
PX
1596 /* Update pss->page for the next dirty bit in ramblock */
1597 pss_find_next_dirty(pss);
1598
6f37bb8b 1599 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1600 pss->page >= rs->last_page) {
b9e60928
DDAG
1601 /*
1602 * We've been once around the RAM and haven't found anything.
1603 * Give up.
1604 */
31e2ac74 1605 return PAGE_ALL_CLEAN;
b9e60928 1606 }
542147f4
DH
1607 if (!offset_in_ramblock(pss->block,
1608 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1609 /* Didn't find anything in this RAM Block */
a935e30f 1610 pss->page = 0;
b9e60928
DDAG
1611 pss->block = QLIST_NEXT_RCU(pss->block, next);
1612 if (!pss->block) {
48df9d80
XG
1613 /*
1614 * If memory migration starts over, we will meet a dirtied page
1615  * which may still exist in the compression threads' ring, so we
1616 * should flush the compressed data to make sure the new page
1617 * is not overwritten by the old one in the destination.
1618 *
1619  * Also, if xbzrle is on, stop using the data compression at this
1620 * point. In theory, xbzrle can do better than compression.
1621 */
1622 flush_compressed_data(rs);
1623
b9e60928
DDAG
1624 /* Hit the end of the list */
1625 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1626 /* Flag that we've looped */
1627 pss->complete_round = true;
1a373522
DH
1628 /* After the first round, enable XBZRLE. */
1629 if (migrate_use_xbzrle()) {
1630 rs->xbzrle_enabled = true;
1631 }
b9e60928
DDAG
1632 }
1633 /* Didn't find anything this time, but try again on the new block */
31e2ac74 1634 return PAGE_TRY_AGAIN;
b9e60928 1635 } else {
31e2ac74
JQ
1636 /* We've found something */
1637 return PAGE_DIRTY_FOUND;
b9e60928
DDAG
1638 }
1639}
1640
3d0684b2
JQ
1641/**
1642 * unqueue_page: gets a page of the queue
1643 *
a82d593b 1644 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1645 *
3d0684b2
JQ
1646 * Returns the block of the page (or NULL if none available)
1647 *
ec481c6c 1648 * @rs: current RAM state
3d0684b2 1649 * @offset: used to return the offset within the RAMBlock
a82d593b 1650 */
f20e2865 1651static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1652{
a1fe28df 1653 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1654 RAMBlock *block = NULL;
1655
a1fe28df 1656 if (!postcopy_has_request(rs)) {
ae526e32
XG
1657 return NULL;
1658 }
1659
6e8a355d 1660 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1661
1662 /*
1663 * This should _never_ change even after we take the lock, because no one
1664 * should be taking anything off the request list other than us.
1665 */
1666 assert(postcopy_has_request(rs));
1667
1668 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1669 block = entry->rb;
1670 *offset = entry->offset;
1671
777f53c7
TH
1672 if (entry->len > TARGET_PAGE_SIZE) {
1673 entry->len -= TARGET_PAGE_SIZE;
1674 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1675 } else {
1676 memory_region_unref(block->mr);
1677 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1678 g_free(entry);
1679 migration_consume_urgent_request();
a82d593b 1680 }
a82d593b
DDAG
1681
1682 return block;
1683}
1684
278e2f55
AG
1685#if defined(__linux__)
1686/**
1687 * poll_fault_page: try to get the next UFFD write fault page and, if a
1688 * pending fault is found, return the RAM block pointer and page offset
1689 *
1690 * Returns pointer to the RAMBlock containing faulting page,
1691 * NULL if no write faults are pending
1692 *
1693 * @rs: current RAM state
1694 * @offset: page offset from the beginning of the block
1695 */
1696static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1697{
1698 struct uffd_msg uffd_msg;
1699 void *page_address;
82ea3e3b 1700 RAMBlock *block;
278e2f55
AG
1701 int res;
1702
1703 if (!migrate_background_snapshot()) {
1704 return NULL;
1705 }
1706
1707 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1708 if (res <= 0) {
1709 return NULL;
1710 }
1711
1712 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1713 block = qemu_ram_block_from_host(page_address, false, offset);
1714 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1715 return block;
278e2f55
AG
1716}
1717
1718/**
1719 * ram_save_release_protection: release UFFD write protection after
1720 * a range of pages has been saved
1721 *
1722 * @rs: current RAM state
1723 * @pss: page-search-status structure
1724 * @start_page: index of the first page in the range relative to pss->block
1725 *
1726 * Returns 0 on success, negative value in case of an error
1727*/
1728static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1729 unsigned long start_page)
1730{
1731 int res = 0;
1732
1733 /* Check if page is from UFFD-managed region. */
1734 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1735 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1736 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1737
1738 /* Flush async buffers before un-protect. */
61717ea9 1739 qemu_fflush(pss->pss_channel);
278e2f55
AG
1740 /* Un-protect memory range. */
1741 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1742 false, false);
1743 }
1744
1745 return res;
1746}
1747
1748/* ram_write_tracking_available: check if the kernel supports the required UFFD features
1749 *
1750 * Returns true if supported, false otherwise
1751 */
1752bool ram_write_tracking_available(void)
1753{
1754 uint64_t uffd_features;
1755 int res;
1756
1757 res = uffd_query_features(&uffd_features);
1758 return (res == 0 &&
1759 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1760}
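/*
 * Minimal standalone probe for the same capability, written directly against
 * the Linux userfaultfd(2) API rather than QEMU's uffd_* helpers (what those
 * helpers do internally is an assumption here).  Passing features == 0 to
 * UFFDIO_API makes the kernel report every feature it supports.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static bool probe_uffd_wp(void)
{
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    bool ok;

    if (fd < 0) {
        return false;
    }
    ok = (ioctl(fd, UFFDIO_API, &api) == 0) &&
         (api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
    close(fd);
    return ok;
}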
1761
1762/* ram_write_tracking_compatible: check if guest configuration is
1763 * compatible with 'write-tracking'
1764 *
1765 * Returns true if compatible, false otherwise
1766 */
1767bool ram_write_tracking_compatible(void)
1768{
1769 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1770 int uffd_fd;
82ea3e3b 1771 RAMBlock *block;
278e2f55
AG
1772 bool ret = false;
1773
1774 /* Open UFFD file descriptor */
1775 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1776 if (uffd_fd < 0) {
1777 return false;
1778 }
1779
1780 RCU_READ_LOCK_GUARD();
1781
82ea3e3b 1782 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1783 uint64_t uffd_ioctls;
1784
1785 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1786 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1787 continue;
1788 }
1789 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1790 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1791 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1792 goto out;
1793 }
1794 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1795 goto out;
1796 }
1797 }
1798 ret = true;
1799
1800out:
1801 uffd_close_fd(uffd_fd);
1802 return ret;
1803}
1804
f7b9dcfb
DH
1805static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1806 ram_addr_t size)
1807{
5f19a449
DH
1808 const ram_addr_t end = offset + size;
1809
f7b9dcfb
DH
1810 /*
1811 * We read one byte of each page; this will preallocate page tables if
1812 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1813  * where no page was populated yet. This might require adaptation when
1814 * supporting other mappings, like shmem.
1815 */
5f19a449 1816 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1817 char tmp = *((char *)block->host + offset);
1818
1819 /* Don't optimize the read out */
1820 asm volatile("" : "+r" (tmp));
1821 }
1822}
1823
6fee3a1f
DH
1824static inline int populate_read_section(MemoryRegionSection *section,
1825 void *opaque)
1826{
1827 const hwaddr size = int128_get64(section->size);
1828 hwaddr offset = section->offset_within_region;
1829 RAMBlock *block = section->mr->ram_block;
1830
1831 populate_read_range(block, offset, size);
1832 return 0;
1833}
1834
eeccb99c 1835/*
f7b9dcfb
DH
1836 * ram_block_populate_read: preallocate page tables and populate pages in the
1837 * RAM block by reading a byte of each page.
eeccb99c
AG
1838 *
1839 * Since it's solely used for the userfault_fd WP feature, here we just
1840 * hardcode the page size to qemu_real_host_page_size.
1841 *
82ea3e3b 1842 * @rb: RAM block to populate
eeccb99c 1843 */
6fee3a1f 1844static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1845{
6fee3a1f
DH
1846 /*
1847 * Skip populating all pages that fall into a discarded range as managed by
1848 * a RamDiscardManager responsible for the mapped memory region of the
1849 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1850 * must not get populated automatically. We don't have to track
1851 * modifications via userfaultfd WP reliably, because these pages will
1852 * not be part of the migration stream either way -- see
1853 * ramblock_dirty_bitmap_exclude_discarded_pages().
1854 *
1855 * Note: The result is only stable while migrating (precopy/postcopy).
1856 */
1857 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1858 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1859 MemoryRegionSection section = {
1860 .mr = rb->mr,
1861 .offset_within_region = 0,
1862 .size = rb->mr->size,
1863 };
1864
1865 ram_discard_manager_replay_populated(rdm, &section,
1866 populate_read_section, NULL);
1867 } else {
1868 populate_read_range(rb, 0, rb->used_length);
1869 }
eeccb99c
AG
1870}
1871
1872/*
1873 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1874 */
1875void ram_write_tracking_prepare(void)
1876{
82ea3e3b 1877 RAMBlock *block;
eeccb99c
AG
1878
1879 RCU_READ_LOCK_GUARD();
1880
82ea3e3b 1881 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1882 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1883 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1884 continue;
1885 }
1886
1887 /*
1888 * Populate pages of the RAM block before enabling userfault_fd
1889 * write protection.
1890 *
1891 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1892 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1893 * pages with pte_none() entries in page table.
1894 */
f7b9dcfb 1895 ram_block_populate_read(block);
eeccb99c
AG
1896 }
1897}
1898
e41c5770
DH
1899static inline int uffd_protect_section(MemoryRegionSection *section,
1900 void *opaque)
1901{
1902 const hwaddr size = int128_get64(section->size);
1903 const hwaddr offset = section->offset_within_region;
1904 RAMBlock *rb = section->mr->ram_block;
1905 int uffd_fd = (uintptr_t)opaque;
1906
1907 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1908 false);
1909}
1910
1911static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1912{
1913 assert(rb->flags & RAM_UF_WRITEPROTECT);
1914
1915 /* See ram_block_populate_read() */
1916 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1917 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1918 MemoryRegionSection section = {
1919 .mr = rb->mr,
1920 .offset_within_region = 0,
1921 .size = rb->mr->size,
1922 };
1923
1924 return ram_discard_manager_replay_populated(rdm, &section,
1925 uffd_protect_section,
1926 (void *)(uintptr_t)uffd_fd);
1927 }
1928 return uffd_change_protection(uffd_fd, rb->host,
1929 rb->used_length, true, false);
1930}
1931
278e2f55
AG
1932/*
1933 * ram_write_tracking_start: start UFFD-WP memory tracking
1934 *
1935 * Returns 0 for success or negative value in case of error
1936 */
1937int ram_write_tracking_start(void)
1938{
1939 int uffd_fd;
1940 RAMState *rs = ram_state;
82ea3e3b 1941 RAMBlock *block;
278e2f55
AG
1942
1943 /* Open UFFD file descriptor */
1944 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1945 if (uffd_fd < 0) {
1946 return uffd_fd;
1947 }
1948 rs->uffdio_fd = uffd_fd;
1949
1950 RCU_READ_LOCK_GUARD();
1951
82ea3e3b 1952 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1953 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1954 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1955 continue;
1956 }
1957
1958 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1959 if (uffd_register_memory(rs->uffdio_fd, block->host,
1960 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1961 goto fail;
1962 }
72ef3a37
DH
1963 block->flags |= RAM_UF_WRITEPROTECT;
1964 memory_region_ref(block->mr);
1965
278e2f55 1966 /* Apply UFFD write protection to the block memory range */
e41c5770 1967 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1968 goto fail;
1969 }
278e2f55 1970
82ea3e3b
AG
1971 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1972 block->host, block->max_length);
278e2f55
AG
1973 }
1974
1975 return 0;
1976
1977fail:
1978 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1979
82ea3e3b
AG
1980 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1982 continue;
1983 }
82ea3e3b 1984 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1985 /* Cleanup flags and remove reference */
82ea3e3b
AG
1986 block->flags &= ~RAM_UF_WRITEPROTECT;
1987 memory_region_unref(block->mr);
278e2f55
AG
1988 }
1989
1990 uffd_close_fd(uffd_fd);
1991 rs->uffdio_fd = -1;
1992 return -1;
1993}
1994
1995/**
1996 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1997 */
1998void ram_write_tracking_stop(void)
1999{
2000 RAMState *rs = ram_state;
82ea3e3b 2001 RAMBlock *block;
278e2f55
AG
2002
2003 RCU_READ_LOCK_GUARD();
2004
82ea3e3b
AG
2005 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2006 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
2007 continue;
2008 }
82ea3e3b 2009 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 2010
82ea3e3b
AG
2011 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2012 block->host, block->max_length);
278e2f55
AG
2013
2014 /* Cleanup flags and remove reference */
82ea3e3b
AG
2015 block->flags &= ~RAM_UF_WRITEPROTECT;
2016 memory_region_unref(block->mr);
278e2f55
AG
2017 }
2018
2019 /* Finally close UFFD file descriptor */
2020 uffd_close_fd(rs->uffdio_fd);
2021 rs->uffdio_fd = -1;
2022}
2023
2024#else
2025/* No target OS support, stubs just fail or ignore */
2026
2027static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2028{
2029 (void) rs;
2030 (void) offset;
2031
2032 return NULL;
2033}
2034
2035static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2036 unsigned long start_page)
2037{
2038 (void) rs;
2039 (void) pss;
2040 (void) start_page;
2041
2042 return 0;
2043}
2044
2045bool ram_write_tracking_available(void)
2046{
2047 return false;
2048}
2049
2050bool ram_write_tracking_compatible(void)
2051{
2052 assert(0);
2053 return false;
2054}
2055
2056int ram_write_tracking_start(void)
2057{
2058 assert(0);
2059 return -1;
2060}
2061
2062void ram_write_tracking_stop(void)
2063{
2064 assert(0);
2065}
2066#endif /* defined(__linux__) */
2067
3d0684b2 2068/**
ff1543af 2069 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2070 *
2071 * Skips pages that are already sent (!dirty)
a82d593b 2072 *
a5f7b1a6 2073 * Returns true if a queued page is found
a82d593b 2074 *
6f37bb8b 2075 * @rs: current RAM state
3d0684b2 2076 * @pss: data about the state of the current dirty page scan
a82d593b 2077 */
f20e2865 2078static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2079{
2080 RAMBlock *block;
2081 ram_addr_t offset;
777f53c7
TH
2082 bool dirty;
2083
2084 do {
2085 block = unqueue_page(rs, &offset);
2086 /*
2087 * We're sending this page, and since it's postcopy nothing else
2088 * will dirty it, and we must make sure it doesn't get sent again
2089 * even if this queue request was received after the background
2090 * search already sent it.
2091 */
2092 if (block) {
2093 unsigned long page;
2094
2095 page = offset >> TARGET_PAGE_BITS;
2096 dirty = test_bit(page, block->bmap);
2097 if (!dirty) {
2098 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2099 page);
2100 } else {
2101 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2102 }
2103 }
a82d593b 2104
777f53c7 2105 } while (block && !dirty);
a82d593b 2106
b062106d 2107 if (!block) {
278e2f55
AG
2108 /*
2109 * Poll write faults too if background snapshot is enabled; that's
2110 * when we have vcpus got blocked by the write protected pages.
2111 */
2112 block = poll_fault_page(rs, &offset);
2113 }
2114
a82d593b 2115 if (block) {
a82d593b
DDAG
2116 /*
2117 * We want the background search to continue from the queued page
2118 * since the guest is likely to want other pages near to the page
2119 * it just requested.
2120 */
2121 pss->block = block;
a935e30f 2122 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2123
2124 /*
2125  * This unqueued page would break the "one round" check, even if it is
2126  * really rare.
2127 */
2128 pss->complete_round = false;
a82d593b
DDAG
2129 }
2130
2131 return !!block;
2132}
2133
6c595cde 2134/**
5e58f968
JQ
2135 * migration_page_queue_free: drop any remaining pages in the ram
2136 * request queue
6c595cde 2137 *
3d0684b2
JQ
2138 * It should be empty at the end anyway, but in error cases there may
2139 * be some left. In case any page is left, we drop it.
2140 *
6c595cde 2141 */
83c13382 2142static void migration_page_queue_free(RAMState *rs)
6c595cde 2143{
ec481c6c 2144 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2145 /* This queue should generally be empty - but in the case of a failed
2146  * migration it might have some leftover entries.
2147 */
89ac5a1d 2148 RCU_READ_LOCK_GUARD();
ec481c6c 2149 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2150 memory_region_unref(mspr->rb->mr);
ec481c6c 2151 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2152 g_free(mspr);
2153 }
6c595cde
DDAG
2154}
2155
2156/**
3d0684b2
JQ
2157 * ram_save_queue_pages: queue the page for transmission
2158 *
2159 * A request from postcopy destination for example.
2160 *
2161 * Returns zero on success or negative on error
2162 *
3d0684b2
JQ
2163 * @rbname: Name of the RAMBlock of the request. NULL means the
2164 *          same as the last one.
2165 * @start: starting address from the start of the RAMBlock
2166 * @len: length (in bytes) to send
6c595cde 2167 */
96506894 2168int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2169{
2170 RAMBlock *ramblock;
53518d94 2171 RAMState *rs = ram_state;
6c595cde 2172
3c764f9b 2173 stat64_add(&ram_counters.postcopy_requests, 1);
89ac5a1d
DDAG
2174 RCU_READ_LOCK_GUARD();
2175
6c595cde
DDAG
2176 if (!rbname) {
2177 /* Reuse last RAMBlock */
68a098f3 2178 ramblock = rs->last_req_rb;
6c595cde
DDAG
2179
2180 if (!ramblock) {
2181 /*
2182 * Shouldn't happen, we can't reuse the last RAMBlock if
2183 * it's the 1st request.
2184 */
2185 error_report("ram_save_queue_pages no previous block");
03acb4e9 2186 return -1;
6c595cde
DDAG
2187 }
2188 } else {
2189 ramblock = qemu_ram_block_by_name(rbname);
2190
2191 if (!ramblock) {
2192 /* We shouldn't be asked for a non-existent RAMBlock */
2193 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2194 return -1;
6c595cde 2195 }
68a098f3 2196 rs->last_req_rb = ramblock;
6c595cde
DDAG
2197 }
2198 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2199 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2200 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2201 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2202 __func__, start, len, ramblock->used_length);
03acb4e9 2203 return -1;
6c595cde
DDAG
2204 }
2205
93589827
PX
2206 /*
2207  * With postcopy preempt enabled, we send back the page directly in the
2208 * rp-return thread.
2209 */
2210 if (postcopy_preempt_active()) {
2211 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2212 size_t page_size = qemu_ram_pagesize(ramblock);
2213 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2214 int ret = 0;
2215
2216 qemu_mutex_lock(&rs->bitmap_mutex);
2217
2218 pss_init(pss, ramblock, page_start);
2219 /*
2220 * Always use the preempt channel, and make sure it's there. It's
2221  * safe to access without the lock, because when the rp-thread is running
2222  * we should be the only one operating on the qemufile.
2223 */
2224 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2225 assert(pss->pss_channel);
2226
2227 /*
2228  * It must be one host page or a multiple of the host page size. Just
2229  * assert; if something is wrong we're mostly split-brain anyway.
2230 */
2231 assert(len % page_size == 0);
2232 while (len) {
2233 if (ram_save_host_page_urgent(pss)) {
2234 error_report("%s: ram_save_host_page_urgent() failed: "
2235 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2236 __func__, ramblock->idstr, start);
2237 ret = -1;
2238 break;
2239 }
2240 /*
2241 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2242 * will automatically be moved and point to the next host page
2243 * we're going to send, so no need to update here.
2244 *
2245 * Normally QEMU never sends >1 host page in requests, so
2246 * logically we don't even need that as the loop should only
2247 * run once, but just to be consistent.
2248 */
2249 len -= page_size;
2250 };
2251 qemu_mutex_unlock(&rs->bitmap_mutex);
2252
2253 return ret;
2254 }
2255
ec481c6c 2256 struct RAMSrcPageRequest *new_entry =
b21e2380 2257 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2258 new_entry->rb = ramblock;
2259 new_entry->offset = start;
2260 new_entry->len = len;
2261
2262 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2263 qemu_mutex_lock(&rs->src_page_req_mutex);
2264 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2265 migration_make_urgent_request();
ec481c6c 2266 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2267
2268 return 0;
6c595cde
DDAG
2269}
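/*
 * Usage sketch, assuming a hypothetical return-path handler: queue four
 * target pages starting 1MiB into a block named "pc.ram" for urgent
 * transmission.  The block name and offsets are made-up example values and
 * error handling is trimmed.
 */
static void example_queue_postcopy_request(void)
{
    if (ram_save_queue_pages("pc.ram", 0x100000, 4 * TARGET_PAGE_SIZE)) {
        /* Queueing failed; a real caller would mark the migration failed. */
    }
}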
2270
d7400a34
XG
2271static bool save_page_use_compression(RAMState *rs)
2272{
2273 if (!migrate_use_compression()) {
2274 return false;
2275 }
2276
2277 /*
1a373522
DH
2278 * If xbzrle is enabled (e.g., after first round of migration), stop
2279 * using the data compression. In theory, xbzrle can do better than
2280 * compression.
d7400a34 2281 */
1a373522
DH
2282 if (rs->xbzrle_enabled) {
2283 return false;
d7400a34
XG
2284 }
2285
1a373522 2286 return true;
d7400a34
XG
2287}
2288
5e5fdcff
XG
2289/*
2290 * Try to compress the page before posting it out; return true if the page
2291 * has been properly handled by compression, otherwise it needs other
2292 * paths to handle it.
2293 */
ec6f3ab9
PX
2294static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2295 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2296{
2297 if (!save_page_use_compression(rs)) {
2298 return false;
2299 }
2300
2301 /*
2302 * When starting the process of a new block, the first page of
2303 * the block should be sent out before other pages in the same
2304  * block, and all the pages in the last block should have been sent
2305  * out. Keeping this order is important, because the 'cont' flag
2306  * is used to avoid resending the block name.
2307  *
2308  * We post the first page as a normal page as compression will take
2309  * a lot of CPU time.
2310 */
ec6f3ab9 2311 if (block != pss->last_sent_block) {
5e5fdcff
XG
2312 flush_compressed_data(rs);
2313 return false;
2314 }
2315
eaa238ab 2316 if (compress_page_with_multi_thread(block, offset) > 0) {
5e5fdcff
XG
2317 return true;
2318 }
2319
76e03000 2320 compression_counters.busy++;
5e5fdcff
XG
2321 return false;
2322}
2323
a82d593b 2324/**
4010ba38 2325 * ram_save_target_page_legacy: save one target page
a82d593b 2326 *
3d0684b2 2327 * Returns the number of pages written
a82d593b 2328 *
6f37bb8b 2329 * @rs: current RAM state
3d0684b2 2330 * @pss: data about the page we want to send
a82d593b 2331 */
4010ba38 2332static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2333{
a8ec91f9 2334 RAMBlock *block = pss->block;
8bba004c 2335 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2336 int res;
2337
61717ea9 2338 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2339 return res;
2340 }
2341
ec6f3ab9 2342 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2343 return 1;
d7400a34
XG
2344 }
2345
37502df3 2346 res = save_zero_page(pss, pss->pss_channel, block, offset);
d7400a34
XG
2347 if (res > 0) {
2348 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2349 * page would be stale
2350 */
ef5c3d13 2351 if (rs->xbzrle_enabled) {
d7400a34
XG
2352 XBZRLE_cache_lock();
2353 xbzrle_cache_zero_page(rs, block->offset + offset);
2354 XBZRLE_cache_unlock();
2355 }
d7400a34
XG
2356 return res;
2357 }
2358
da3f56cb 2359 /*
6f39c90b
PX
2360 * Do not use multifd in postcopy as one whole host page should be
2361 * placed. Meanwhile postcopy requires atomic update of pages, so even
2362  * if host page size == guest page size, the destination guest may
2363  * still see partially copied pages at runtime, which is data corruption.
da3f56cb 2364 */
6f39c90b 2365 if (migrate_use_multifd() && !migration_in_postcopy()) {
61717ea9 2366 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2367 }
2368
05931ec5 2369 return ram_save_page(rs, pss);
a82d593b
DDAG
2370}
2371
d9e474ea
PX
2372/* Should be called before sending a host page */
2373static void pss_host_page_prepare(PageSearchStatus *pss)
2374{
2375 /* How many guest pages are there in one host page? */
2376 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2377
2378 pss->host_page_sending = true;
301d7ffe
PX
2379 if (guest_pfns <= 1) {
2380 /*
2381 * This covers both when guest psize == host psize, or when guest
2382 * has larger psize than the host (guest_pfns==0).
2383 *
2384 * For the latter, we always send one whole guest page per
2385 * iteration of the host page (example: an Alpha VM on x86 host
2386 * will have guest psize 8K while host psize 4K).
2387 */
2388 pss->host_page_start = pss->page;
2389 pss->host_page_end = pss->page + 1;
2390 } else {
2391 /*
2392 * The host page spans over multiple guest pages, we send them
2393 * within the same host page iteration.
2394 */
2395 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2396 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2397 }
d9e474ea
PX
2398}
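/*
 * Worked example of the host-page rounding above, as a standalone sketch
 * with hypothetical sizes: a 2MiB host huge page backing 4KiB target pages
 * gives guest_pfns == 512, so target page index 1000 falls inside the
 * host-page window [512, 1024).  ROUND_DOWN/ROUND_UP mimic QEMU's macros
 * for power-of-two alignments.
 */
#include <assert.h>
#include <stddef.h>

#define ROUND_DOWN(n, d) ((n) / (d) * (d))
#define ROUND_UP(n, d)   (ROUND_DOWN((n) + (d) - 1, (d)))

static void host_page_window(size_t page, size_t guest_pfns,
                             size_t *start, size_t *end)
{
    if (guest_pfns <= 1) {
        /* guest psize >= host psize: one guest page per iteration */
        *start = page;
        *end = page + 1;
    } else {
        /* host page spans several guest pages: send them together */
        *start = ROUND_DOWN(page, guest_pfns);
        *end = ROUND_UP(page + 1, guest_pfns);
    }
}

static void host_page_window_example(void)
{
    size_t start, end;

    host_page_window(1000, 512, &start, &end);   /* 2MiB host huge page */
    assert(start == 512 && end == 1024);

    host_page_window(1000, 0, &start, &end);     /* guest psize > host psize */
    assert(start == 1000 && end == 1001);
}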
2399
2400/*
2401 * Whether the page pointed by PSS is within the host page being sent.
2402 * Must be called after a previous pss_host_page_prepare().
2403 */
2404static bool pss_within_range(PageSearchStatus *pss)
2405{
2406 ram_addr_t ram_addr;
2407
2408 assert(pss->host_page_sending);
2409
2410 /* Over host-page boundary? */
2411 if (pss->page >= pss->host_page_end) {
2412 return false;
2413 }
2414
2415 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2416
2417 return offset_in_ramblock(pss->block, ram_addr);
2418}
2419
2420static void pss_host_page_finish(PageSearchStatus *pss)
2421{
2422 pss->host_page_sending = false;
2423 /* This is not needed, but just to reset it */
2424 pss->host_page_start = pss->host_page_end = 0;
2425}
2426
93589827
PX
2427/*
2428 * Send an urgent host page specified by `pss'. Needs to be called with
2429 * bitmap_mutex held.
2430 *
2431 * Returns 0 if saving the host page succeeded, a negative value otherwise.
2432 */
2433static int ram_save_host_page_urgent(PageSearchStatus *pss)
2434{
2435 bool page_dirty, sent = false;
2436 RAMState *rs = ram_state;
2437 int ret = 0;
2438
2439 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2440 pss_host_page_prepare(pss);
2441
2442 /*
2443 * If precopy is sending the same page, let it be done in precopy, or
2444 * we could send the same page in two channels and none of them will
2445 * receive the whole page.
2446 */
2447 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2448 trace_postcopy_preempt_hit(pss->block->idstr,
2449 pss->page << TARGET_PAGE_BITS);
2450 return 0;
2451 }
2452
2453 do {
2454 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2455
2456 if (page_dirty) {
2457 /* Be strict to return code; it must be 1, or what else? */
4010ba38 2458 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2459 error_report_once("%s: ram_save_target_page failed", __func__);
2460 ret = -1;
2461 goto out;
2462 }
2463 sent = true;
2464 }
2465 pss_find_next_dirty(pss);
2466 } while (pss_within_range(pss));
2467out:
2468 pss_host_page_finish(pss);
2469 /* For urgent requests, flush immediately if sent */
2470 if (sent) {
2471 qemu_fflush(pss->pss_channel);
2472 }
2473 return ret;
2474}
2475
a82d593b 2476/**
3d0684b2 2477 * ram_save_host_page: save a whole host page
a82d593b 2478 *
3d0684b2
JQ
2479 * Starting at *offset send pages up to the end of the current host
2480 * page. It's valid for the initial offset to point into the middle of
2481 * a host page, in which case the remainder of the host page is sent.
2482 * Only dirty target pages are sent. Note that the host page size may
2483 * be a huge page for this block.
f3321554 2484 *
1eb3fc0a
DDAG
2485 * The saving stops at the boundary of the used_length of the block
2486 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2487 *
f3321554
PX
2488 * The caller must hold ram_state.bitmap_mutex when calling this
2489 * function. Note that this function can temporarily release the lock, but
2490 * when the function returns it'll make sure the lock is still held.
2491 *
3d0684b2
JQ
2492 * Returns the number of pages written or negative on error
2493 *
6f37bb8b 2494 * @rs: current RAM state
3d0684b2 2495 * @pss: data about the page we want to send
a82d593b 2496 */
05931ec5 2497static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2498{
f3321554 2499 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2500 int tmppages, pages = 0;
a935e30f
JQ
2501 size_t pagesize_bits =
2502 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2503 unsigned long start_page = pss->page;
2504 int res;
4c011c37 2505
fbd162e6 2506 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2507 error_report("block %s should not be migrated !", pss->block->idstr);
2508 return 0;
2509 }
2510
d9e474ea
PX
2511 /* Update host page boundary information */
2512 pss_host_page_prepare(pss);
2513
a82d593b 2514 do {
f3321554 2515 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2516
f3321554
PX
2517 /* Check if the page is dirty and, if it is, send it */
2518 if (page_dirty) {
ba1b7c81 2519 /*
f3321554
PX
2520 * Properly yield the lock only in postcopy preempt mode
2521 * because both migration thread and rp-return thread can
2522 * operate on the bitmaps.
ba1b7c81 2523 */
f3321554
PX
2524 if (preempt_active) {
2525 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2526 }
4010ba38 2527 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2528 if (tmppages >= 0) {
2529 pages += tmppages;
2530 /*
2531 * Allow rate limiting to happen in the middle of huge pages if
2532 * something is sent in the current iteration.
2533 */
2534 if (pagesize_bits > 1 && tmppages > 0) {
2535 migration_rate_limit();
2536 }
2537 }
2538 if (preempt_active) {
2539 qemu_mutex_lock(&rs->bitmap_mutex);
2540 }
2541 } else {
2542 tmppages = 0;
23feba90 2543 }
f3321554
PX
2544
2545 if (tmppages < 0) {
d9e474ea 2546 pss_host_page_finish(pss);
f3321554
PX
2547 return tmppages;
2548 }
2549
d9e474ea
PX
2550 pss_find_next_dirty(pss);
2551 } while (pss_within_range(pss));
2552
2553 pss_host_page_finish(pss);
278e2f55
AG
2554
2555 res = ram_save_release_protection(rs, pss, start_page);
2556 return (res < 0 ? res : pages);
a82d593b 2557}
6c595cde 2558
56e93d26 2559/**
3d0684b2 2560 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2561 *
2562 * Called within an RCU critical section.
2563 *
e8f3735f
XG
2564 * Returns the number of pages written where zero means no dirty pages,
2565 * or negative on error
56e93d26 2566 *
6f37bb8b 2567 * @rs: current RAM state
a82d593b
DDAG
2568 *
2569 * On systems where host-page-size > target-page-size it will send all the
2570 * pages in a host page that are dirty.
56e93d26 2571 */
05931ec5 2572static int ram_find_and_save_block(RAMState *rs)
56e93d26 2573{
f1668764 2574 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2575 int pages = 0;
56e93d26 2576
0827b9e9 2577 /* No dirty page as there is zero RAM */
8d80e195 2578 if (!rs->ram_bytes_total) {
0827b9e9
AA
2579 return pages;
2580 }
2581
4934a5dd
PX
2582 /*
2583 * Always keep last_seen_block/last_page valid during this procedure,
2584 * because find_dirty_block() relies on these values (e.g., we compare
2585 * last_seen_block with pss.block to see whether we searched all the
2586  * ramblocks) to detect the completion of migration. Having a NULL value
2587  * of last_seen_block can conditionally cause the loop below to run forever.
2588 */
2589 if (!rs->last_seen_block) {
2590 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2591 rs->last_page = 0;
2592 }
2593
f1668764 2594 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2595
31e2ac74 2596 while (true) {
51efd36f 2597 if (!get_queued_page(rs, pss)) {
b062106d 2598 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2599 int res = find_dirty_block(rs, pss);
2600 if (res != PAGE_DIRTY_FOUND) {
2601 if (res == PAGE_ALL_CLEAN) {
51efd36f 2602 break;
31e2ac74
JQ
2603 } else if (res == PAGE_TRY_AGAIN) {
2604 continue;
51efd36f
JQ
2605 }
2606 }
56e93d26 2607 }
51efd36f 2608 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2609 if (pages) {
2610 break;
2611 }
2612 }
56e93d26 2613
f1668764
PX
2614 rs->last_seen_block = pss->block;
2615 rs->last_page = pss->page;
56e93d26
JQ
2616
2617 return pages;
2618}
2619
2620void acct_update_position(QEMUFile *f, size_t size, bool zero)
2621{
2622 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2623
56e93d26 2624 if (zero) {
1a386e8d 2625 stat64_add(&ram_counters.zero_pages, pages);
56e93d26 2626 } else {
8c0cda8f 2627 stat64_add(&ram_counters.normal_pages, pages);
4c2d0f6d 2628 ram_transferred_add(size);
1a93bd2f 2629 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2630 }
2631}
2632
8008a272 2633static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2634{
2635 RAMBlock *block;
2636 uint64_t total = 0;
2637
89ac5a1d
DDAG
2638 RCU_READ_LOCK_GUARD();
2639
8008a272
JQ
2640 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2641 total += block->used_length;
99e15582 2642 }
56e93d26
JQ
2643 return total;
2644}
2645
fbd162e6
YK
2646uint64_t ram_bytes_total(void)
2647{
8008a272
JQ
2648 RAMBlock *block;
2649 uint64_t total = 0;
2650
2651 RCU_READ_LOCK_GUARD();
2652
2653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654 total += block->used_length;
2655 }
2656 return total;
fbd162e6
YK
2657}
2658
f265e0e4 2659static void xbzrle_load_setup(void)
56e93d26 2660{
f265e0e4 2661 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2662}
2663
f265e0e4
JQ
2664static void xbzrle_load_cleanup(void)
2665{
2666 g_free(XBZRLE.decoded_buf);
2667 XBZRLE.decoded_buf = NULL;
2668}
2669
7d7c96be
PX
2670static void ram_state_cleanup(RAMState **rsp)
2671{
b9ccaf6d
DDAG
2672 if (*rsp) {
2673 migration_page_queue_free(*rsp);
2674 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2675 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2676 g_free(*rsp);
2677 *rsp = NULL;
2678 }
7d7c96be
PX
2679}
2680
84593a08
PX
2681static void xbzrle_cleanup(void)
2682{
2683 XBZRLE_cache_lock();
2684 if (XBZRLE.cache) {
2685 cache_fini(XBZRLE.cache);
2686 g_free(XBZRLE.encoded_buf);
2687 g_free(XBZRLE.current_buf);
2688 g_free(XBZRLE.zero_target_page);
2689 XBZRLE.cache = NULL;
2690 XBZRLE.encoded_buf = NULL;
2691 XBZRLE.current_buf = NULL;
2692 XBZRLE.zero_target_page = NULL;
2693 }
2694 XBZRLE_cache_unlock();
2695}
2696
f265e0e4 2697static void ram_save_cleanup(void *opaque)
56e93d26 2698{
53518d94 2699 RAMState **rsp = opaque;
6b6712ef 2700 RAMBlock *block;
eb859c53 2701
278e2f55
AG
2702 /* We don't use dirty log with background snapshots */
2703 if (!migrate_background_snapshot()) {
2704 /* The caller must hold the iothread lock or be in a BH, so there is
2705  * no write race against the migration bitmap
2706 */
63b41db4
HH
2707 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2708 /*
2709 * do not stop dirty log without starting it, since
2710 * memory_global_dirty_log_stop will assert that
2711  * memory_global_dirty_log_start/stop are used in pairs
2712 */
2713 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2714 }
278e2f55 2715 }
6b6712ef 2716
fbd162e6 2717 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2718 g_free(block->clear_bmap);
2719 block->clear_bmap = NULL;
6b6712ef
JQ
2720 g_free(block->bmap);
2721 block->bmap = NULL;
56e93d26
JQ
2722 }
2723
84593a08 2724 xbzrle_cleanup();
f0afa331 2725 compress_threads_save_cleanup();
7d7c96be 2726 ram_state_cleanup(rsp);
4010ba38
JQ
2727 g_free(migration_ops);
2728 migration_ops = NULL;
56e93d26
JQ
2729}
2730
6f37bb8b 2731static void ram_state_reset(RAMState *rs)
56e93d26 2732{
ec6f3ab9
PX
2733 int i;
2734
2735 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2736 rs->pss[i].last_sent_block = NULL;
2737 }
2738
6f37bb8b 2739 rs->last_seen_block = NULL;
269ace29 2740 rs->last_page = 0;
6f37bb8b 2741 rs->last_version = ram_list.version;
1a373522 2742 rs->xbzrle_enabled = false;
56e93d26
JQ
2743}
2744
2745#define MAX_WAIT 50 /* ms, half buffered_file limit */
2746
e0b266f0
DDAG
2747/* **** functions for postcopy ***** */
2748
ced1c616
PB
2749void ram_postcopy_migrated_memory_release(MigrationState *ms)
2750{
2751 struct RAMBlock *block;
ced1c616 2752
fbd162e6 2753 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2754 unsigned long *bitmap = block->bmap;
2755 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2756 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2757
2758 while (run_start < range) {
2759 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2760 ram_discard_range(block->idstr,
2761 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2762 ((ram_addr_t)(run_end - run_start))
2763 << TARGET_PAGE_BITS);
ced1c616
PB
2764 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2765 }
2766 }
2767}
2768
3d0684b2
JQ
2769/**
2770 * postcopy_send_discard_bm_ram: discard a RAMBlock
2771 *
e0b266f0 2772 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2773 *
2774 * @ms: current migration state
89dab31b 2775 * @block: RAMBlock to discard
e0b266f0 2776 */
9e7d1223 2777static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2778{
6b6712ef 2779 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2780 unsigned long current;
1e7cf8c3 2781 unsigned long *bitmap = block->bmap;
e0b266f0 2782
6b6712ef 2783 for (current = 0; current < end; ) {
1e7cf8c3 2784 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2785 unsigned long zero, discard_length;
e0b266f0 2786
33a5cb62
WY
2787 if (one >= end) {
2788 break;
2789 }
e0b266f0 2790
1e7cf8c3 2791 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2792
2793 if (zero >= end) {
2794 discard_length = end - one;
e0b266f0 2795 } else {
33a5cb62
WY
2796 discard_length = zero - one;
2797 }
810cf2bb 2798 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2799 current = one + discard_length;
e0b266f0 2800 }
e0b266f0
DDAG
2801}
2802
f30c2e5b
PX
2803static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2804
3d0684b2
JQ
2805/**
2806 * postcopy_each_ram_send_discard: discard all RAMBlocks
2807 *
e0b266f0
DDAG
2808 * Utility for the outgoing postcopy code.
2809 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2810 * passing it bitmap indexes and name.
e0b266f0
DDAG
2811 * (qemu_ram_foreach_block ends up passing unscaled lengths
2812 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2813 *
2814 * @ms: current migration state
e0b266f0 2815 */
739fcc1b 2816static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2817{
2818 struct RAMBlock *block;
e0b266f0 2819
fbd162e6 2820 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2821 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2822
f30c2e5b
PX
2823 /*
2824  * Deal with TPS != HPS and huge pages. It discards any partially sent
2825  * host-page size chunks and marks any partially dirty host-page size
2826 * chunks as all dirty. In this case the host-page is the host-page
2827 * for the particular RAMBlock, i.e. it might be a huge page.
2828 */
2829 postcopy_chunk_hostpages_pass(ms, block);
2830
e0b266f0
DDAG
2831 /*
2832 * Postcopy sends chunks of bitmap over the wire, but it
2833 * just needs indexes at this point, avoids it having
2834 * target page specific code.
2835 */
739fcc1b 2836 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2837 postcopy_discard_send_finish(ms);
e0b266f0 2838 }
e0b266f0
DDAG
2839}
2840
3d0684b2 2841/**
8324ef86 2842 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2843 *
2844 * Helper for postcopy_chunk_hostpages; it's called twice to
2845 * canonicalize the two bitmaps, which are similar but one is
2846 * inverted.
99e314eb 2847 *
3d0684b2
JQ
2848 * Postcopy requires that all target pages in a host page are dirty or
2849 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2850 *
3d0684b2 2851 * @ms: current migration state
3d0684b2 2852 * @block: block that contains the page we want to canonicalize
99e314eb 2853 */
1e7cf8c3 2854static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2855{
53518d94 2856 RAMState *rs = ram_state;
6b6712ef 2857 unsigned long *bitmap = block->bmap;
29c59172 2858 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2859 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2860 unsigned long run_start;
2861
29c59172
DDAG
2862 if (block->page_size == TARGET_PAGE_SIZE) {
2863 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2864 return;
2865 }
2866
1e7cf8c3
WY
2867 /* Find a dirty page */
2868 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2869
6b6712ef 2870 while (run_start < pages) {
99e314eb
DDAG
2871
2872 /*
2873 * If the start of this run of pages is in the middle of a host
2874 * page, then we need to fixup this host page.
2875 */
9dec3cc3 2876 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2877 /* Find the end of this run */
1e7cf8c3 2878 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2879 /*
2880 * If the end isn't at the start of a host page, then the
2881 * run doesn't finish at the end of a host page
2882 * and we need to discard.
2883 */
99e314eb
DDAG
2884 }
2885
9dec3cc3 2886 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2887 unsigned long page;
dad45ab2
WY
2888 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2889 host_ratio);
2890 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2891
99e314eb
DDAG
2892 /* Clean up the bitmap */
2893 for (page = fixup_start_addr;
2894 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2895 /*
2896 * Remark them as dirty, updating the count for any pages
2897 * that weren't previously dirty.
2898 */
0d8ec885 2899 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2900 }
2901 }
2902
1e7cf8c3
WY
2903 /* Find the next dirty page for the next iteration */
2904 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2905 }
2906}
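/*
 * Standalone numeric sketch of the alignment fix-up above, using a made-up
 * host_ratio of 4 (e.g. 16KiB host pages over 4KiB target pages): a dirty
 * run starting at target page 6 is widened so the whole host page [4, 8)
 * is treated as dirty, and the scan resumes at the next host-page boundary.
 */
#include <assert.h>

#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m)   (ALIGN_DOWN((n) + (m) - 1, (m)))

static void hostpage_fixup_example(void)
{
    unsigned long host_ratio = 4;
    unsigned long run_start = 6;                             /* unaligned run */

    unsigned long fixup_start = ALIGN_DOWN(run_start, host_ratio);  /* -> 4 */
    unsigned long next_start = ALIGN_UP(run_start, host_ratio);     /* -> 8 */

    assert(fixup_start == 4 && next_start == 8);
    /* Pages 4..7 would all be re-marked dirty in block->bmap. */
}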
2907
3d0684b2
JQ
2908/**
2909 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2910 *
e0b266f0
DDAG
2911 * Transmit the set of pages to be discarded after precopy to the target;
2912 * these are pages that:
2913 * a) Have been previously transmitted but are now dirty again
2914 * b) Have never been transmitted; this ensures that
2915 * any pages on the destination that have been mapped by background
2916 * tasks get discarded (transparent huge pages is the specific concern)
2917 * Hopefully this is pretty sparse
3d0684b2
JQ
2918 *
2919 * @ms: current migration state
e0b266f0 2920 */
739fcc1b 2921void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2922{
53518d94 2923 RAMState *rs = ram_state;
e0b266f0 2924
89ac5a1d 2925 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2926
2927 /* This should be our last sync, the src is now paused */
eb859c53 2928 migration_bitmap_sync(rs);
e0b266f0 2929
6b6712ef 2930 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2931 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2932 rs->last_seen_block = NULL;
6b6712ef 2933 rs->last_page = 0;
e0b266f0 2934
739fcc1b 2935 postcopy_each_ram_send_discard(ms);
e0b266f0 2936
739fcc1b 2937 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2938}
2939
3d0684b2
JQ
2940/**
2941 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2942 *
3d0684b2 2943 * Returns zero on success
e0b266f0 2944 *
36449157
JQ
2945 * @rbname: name of the RAMBlock of the request. NULL means the
2946 * same that last one.
3d0684b2
JQ
2947 * @start: RAMBlock starting page
2948 * @length: RAMBlock size
e0b266f0 2949 */
aaa2064c 2950int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2951{
36449157 2952 trace_ram_discard_range(rbname, start, length);
d3a5038c 2953
89ac5a1d 2954 RCU_READ_LOCK_GUARD();
36449157 2955 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2956
2957 if (!rb) {
36449157 2958 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2959 return -1;
e0b266f0
DDAG
2960 }
2961
814bb08f
PX
2962 /*
2963 * On source VM, we don't need to update the received bitmap since
2964 * we don't even have one.
2965 */
2966 if (rb->receivedmap) {
2967 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2968 length >> qemu_target_page_bits());
2969 }
2970
03acb4e9 2971 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2972}
2973
84593a08
PX
2974/*
2975 * For every allocation, we will try not to crash the VM if the
2976 * allocation fails.
2977 */
2978static int xbzrle_init(void)
2979{
2980 Error *local_err = NULL;
2981
2982 if (!migrate_use_xbzrle()) {
2983 return 0;
2984 }
2985
2986 XBZRLE_cache_lock();
2987
2988 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2989 if (!XBZRLE.zero_target_page) {
2990 error_report("%s: Error allocating zero page", __func__);
2991 goto err_out;
2992 }
2993
2994 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2995 TARGET_PAGE_SIZE, &local_err);
2996 if (!XBZRLE.cache) {
2997 error_report_err(local_err);
2998 goto free_zero_page;
2999 }
3000
3001 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3002 if (!XBZRLE.encoded_buf) {
3003 error_report("%s: Error allocating encoded_buf", __func__);
3004 goto free_cache;
3005 }
3006
3007 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3008 if (!XBZRLE.current_buf) {
3009 error_report("%s: Error allocating current_buf", __func__);
3010 goto free_encoded_buf;
3011 }
3012
3013 /* We are all good */
3014 XBZRLE_cache_unlock();
3015 return 0;
3016
3017free_encoded_buf:
3018 g_free(XBZRLE.encoded_buf);
3019 XBZRLE.encoded_buf = NULL;
3020free_cache:
3021 cache_fini(XBZRLE.cache);
3022 XBZRLE.cache = NULL;
3023free_zero_page:
3024 g_free(XBZRLE.zero_target_page);
3025 XBZRLE.zero_target_page = NULL;
3026err_out:
3027 XBZRLE_cache_unlock();
3028 return -ENOMEM;
3029}
3030
53518d94 3031static int ram_state_init(RAMState **rsp)
56e93d26 3032{
7d00ee6a
PX
3033 *rsp = g_try_new0(RAMState, 1);
3034
3035 if (!*rsp) {
3036 error_report("%s: Init ramstate fail", __func__);
3037 return -1;
3038 }
53518d94
JQ
3039
3040 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3041 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3042 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 3043 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 3044
7d00ee6a 3045 /*
40c4d4a8
IR
3046 * Count the total number of pages used by ram blocks not including any
3047 * gaps due to alignment or unplugs.
03158519 3048 * This must match the initial value of the dirty bitmap.
7d00ee6a 3049 */
8d80e195 3050 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
3051 ram_state_reset(*rsp);
3052
3053 return 0;
3054}
3055
d6eff5d7 3056static void ram_list_init_bitmaps(void)
7d00ee6a 3057{
002cad6b 3058 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3059 RAMBlock *block;
3060 unsigned long pages;
002cad6b 3061 uint8_t shift;
56e93d26 3062
0827b9e9
AA
3063 /* Skip setting bitmap if there is no RAM */
3064 if (ram_bytes_total()) {
002cad6b
PX
3065 shift = ms->clear_bitmap_shift;
3066 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3067 error_report("clear_bitmap_shift (%u) too big, using "
3068 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3069 shift = CLEAR_BITMAP_SHIFT_MAX;
3070 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3071 error_report("clear_bitmap_shift (%u) too small, using "
3072 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3073 shift = CLEAR_BITMAP_SHIFT_MIN;
3074 }
3075
fbd162e6 3076 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3077 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3078 /*
3079 * The initial dirty bitmap for migration must be set with all
3080  * ones to make sure we'll migrate every guest RAM page to the
3081  * destination.
40c4d4a8
IR
3082  * Here we set RAMBlock.bmap all to 1 because when we restart a
3083  * new migration after a failed migration, ram_list.
3084  * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3085 * guest memory.
03158519 3086 */
6b6712ef 3087 block->bmap = bitmap_new(pages);
40c4d4a8 3088 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3089 block->clear_bmap_shift = shift;
3090 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3091 }
f3f491fc 3092 }
d6eff5d7
PX
3093}
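/*
 * Sketch of the clear_bmap sizing, under the assumption that one bit of
 * clear_bmap covers a chunk of (1 << clear_bitmap_shift) guest pages, which
 * is taken to be what clear_bmap_size() computes.  Assuming the default
 * shift of 18 and 4KiB target pages, one bit would then track 1GiB of RAM.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t example_clear_bmap_bits(uint64_t pages, uint8_t shift)
{
    uint64_t chunk = 1ULL << shift;

    return (pages + chunk - 1) / chunk;   /* DIV_ROUND_UP(pages, chunk) */
}

static void clear_bmap_example(void)
{
    uint64_t pages = (4ULL << 30) / 4096;   /* 4GiB of RAM in 4KiB pages */

    assert(example_clear_bmap_bits(pages, 18) == 4);   /* four 1GiB chunks */
}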
3094
be39b4cd
DH
3095static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3096{
3097 unsigned long pages;
3098 RAMBlock *rb;
3099
3100 RCU_READ_LOCK_GUARD();
3101
3102 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3103 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3104 rs->migration_dirty_pages -= pages;
3105 }
3106}
3107
d6eff5d7
PX
3108static void ram_init_bitmaps(RAMState *rs)
3109{
3110 /* For memory_global_dirty_log_start below. */
3111 qemu_mutex_lock_iothread();
3112 qemu_mutex_lock_ramlist();
f3f491fc 3113
89ac5a1d
DDAG
3114 WITH_RCU_READ_LOCK_GUARD() {
3115 ram_list_init_bitmaps();
278e2f55
AG
3116 /* We don't use dirty log with background snapshots */
3117 if (!migrate_background_snapshot()) {
63b41db4 3118 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3119 migration_bitmap_sync_precopy(rs);
3120 }
89ac5a1d 3121 }
56e93d26 3122 qemu_mutex_unlock_ramlist();
49877834 3123 qemu_mutex_unlock_iothread();
be39b4cd
DH
3124
3125 /*
3126 * After an eventual first bitmap sync, fixup the initial bitmap
3127 * containing all 1s to exclude any discarded pages from migration.
3128 */
3129 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3130}
3131
3132static int ram_init_all(RAMState **rsp)
3133{
3134 if (ram_state_init(rsp)) {
3135 return -1;
3136 }
3137
3138 if (xbzrle_init()) {
3139 ram_state_cleanup(rsp);
3140 return -1;
3141 }
3142
3143 ram_init_bitmaps(*rsp);
a91246c9
HZ
3144
3145 return 0;
3146}
3147
08614f34
PX
3148static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3149{
3150 RAMBlock *block;
3151 uint64_t pages = 0;
3152
3153 /*
3154 * Postcopy is not using xbzrle/compression, so no need for that.
3155  * Also, since the source is already halted, we don't need to care
3156  * about dirty page logging either.
3157 */
3158
fbd162e6 3159 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3160 pages += bitmap_count_one(block->bmap,
3161 block->used_length >> TARGET_PAGE_BITS);
3162 }
3163
3164 /* This may not be aligned with current bitmaps. Recalculate. */
3165 rs->migration_dirty_pages = pages;
3166
1a373522 3167 ram_state_reset(rs);
08614f34
PX
3168
3169 /* Update RAMState cache of output QEMUFile */
7f401b80 3170 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3171
3172 trace_ram_state_resume_prepare(pages);
3173}
3174
6bcb05fc
WW
3175/*
3176 * This function clears bits of the free pages reported by the caller from the
3177 * migration dirty bitmap. @addr is the host address corresponding to the
3178 * start of the contiguous guest free pages, and @len is the total size of
3179 * those pages in bytes.
3180 */
3181void qemu_guest_free_page_hint(void *addr, size_t len)
3182{
3183 RAMBlock *block;
3184 ram_addr_t offset;
3185 size_t used_len, start, npages;
3186 MigrationState *s = migrate_get_current();
3187
3188 /* This function is currently expected to be used during live migration */
3189 if (!migration_is_setup_or_active(s->state)) {
3190 return;
3191 }
3192
3193 for (; len > 0; len -= used_len, addr += used_len) {
3194 block = qemu_ram_block_from_host(addr, false, &offset);
3195 if (unlikely(!block || offset >= block->used_length)) {
3196 /*
3197 * The implementation might not support RAMBlock resize during
3198 * live migration, but it could happen in theory with future
3199 * updates. So we add a check here to capture that case.
3200 */
3201 error_report_once("%s unexpected error", __func__);
3202 return;
3203 }
3204
3205 if (len <= block->used_length - offset) {
3206 used_len = len;
3207 } else {
3208 used_len = block->used_length - offset;
3209 }
3210
3211 start = offset >> TARGET_PAGE_BITS;
3212 npages = used_len >> TARGET_PAGE_BITS;
3213
3214 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3215 /*
3216  * The skipped free pages are equivalent to having been sent from clear_bmap's
3217 * perspective, so clear the bits from the memory region bitmap which
3218 * are initially set. Otherwise those skipped pages will be sent in
3219 * the next round after syncing from the memory region bitmap.
3220 */
1230a25f 3221 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3222 ram_state->migration_dirty_pages -=
3223 bitmap_count_one_with_offset(block->bmap, start, npages);
3224 bitmap_clear(block->bmap, start, npages);
3225 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3226 }
3227}
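/*
 * Usage sketch: a free-page-hinting source (for example a balloon-style
 * device model; the wrapper name here is hypothetical) calls the helper
 * above with a host pointer and a byte length once the guest reports a run
 * of free pages.
 */
static void example_report_free_run(void *host_addr, size_t bytes)
{
    /* No-op outside of an active migration, so it's safe to call anytime. */
    qemu_guest_free_page_hint(host_addr, bytes);
}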
3228
3d0684b2
JQ
3229/*
3230 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3231 * a long-running RCU critical section. When RCU reclaims in the code
3232 * start to become numerous it will be necessary to reduce the
3233 * granularity of these critical sections.
3234 */
3235
3d0684b2
JQ
3236/**
3237 * ram_save_setup: Setup RAM for migration
3238 *
3239 * Returns zero to indicate success and negative for error
3240 *
3241 * @f: QEMUFile where to send the data
3242 * @opaque: RAMState pointer
3243 */
a91246c9
HZ
3244static int ram_save_setup(QEMUFile *f, void *opaque)
3245{
53518d94 3246 RAMState **rsp = opaque;
a91246c9 3247 RAMBlock *block;
33d70973 3248 int ret;
a91246c9 3249
dcaf446e
XG
3250 if (compress_threads_save_setup()) {
3251 return -1;
3252 }
3253
a91246c9
HZ
3254 /* migration has already setup the bitmap, reuse it. */
3255 if (!migration_in_colo_state()) {
7d00ee6a 3256 if (ram_init_all(rsp) != 0) {
dcaf446e 3257 compress_threads_save_cleanup();
a91246c9 3258 return -1;
53518d94 3259 }
a91246c9 3260 }
7f401b80 3261 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3262
0e6ebd48 3263 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3264 qemu_put_be64(f, ram_bytes_total_with_ignored()
3265 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3266
0e6ebd48
DDAG
3267 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3268 qemu_put_byte(f, strlen(block->idstr));
3269 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3270 qemu_put_be64(f, block->used_length);
3271 if (migrate_postcopy_ram() && block->page_size !=
3272 qemu_host_page_size) {
3273 qemu_put_be64(f, block->page_size);
3274 }
3275 if (migrate_ignore_shared()) {
3276 qemu_put_be64(f, block->mr->addr);
3277 }
fbd162e6 3278 }
56e93d26
JQ
3279 }
3280
56e93d26
JQ
3281 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3282 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3283
4010ba38
JQ
3284 migration_ops = g_malloc0(sizeof(MigrationOps));
3285 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
8ebb6ecc 3286 ret = multifd_send_sync_main(f);
33d70973
LB
3287 if (ret < 0) {
3288 return ret;
3289 }
3290
56e93d26 3291 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3292 qemu_fflush(f);
56e93d26
JQ
3293
3294 return 0;
3295}
3296
3d0684b2
JQ
3297/**
3298 * ram_save_iterate: iterative stage for migration
3299 *
3300 * Returns zero to indicate success and negative for error
3301 *
3302 * @f: QEMUFile where to send the data
3303 * @opaque: RAMState pointer
3304 */
56e93d26
JQ
3305static int ram_save_iterate(QEMUFile *f, void *opaque)
3306{
53518d94
JQ
3307 RAMState **temp = opaque;
3308 RAMState *rs = *temp;
3d4095b2 3309 int ret = 0;
56e93d26
JQ
3310 int i;
3311 int64_t t0;
5c90308f 3312 int done = 0;
56e93d26 3313
b2557345
PL
3314 if (blk_mig_bulk_active()) {
3315 /* Avoid transferring ram during bulk phase of block migration as
3316 * the bulk phase will usually take a long time and transferring
3317 * ram updates during that time is pointless. */
3318 goto out;
3319 }
3320
63268c49
PX
3321 /*
3322      * We'll hold this lock a bit longer than usual, but it's okay for two
3323      * reasons.  Firstly, the only other thread that can take it is the one
3324      * calling qemu_guest_free_page_hint(), which should be rare; secondly,
3325      * see MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below,
3326      * which guarantees that we'll release it on a regular basis.
3327 */
3328 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3329 WITH_RCU_READ_LOCK_GUARD() {
3330 if (ram_list.version != rs->last_version) {
3331 ram_state_reset(rs);
3332 }
56e93d26 3333
89ac5a1d
DDAG
3334 /* Read version before ram_list.blocks */
3335 smp_rmb();
56e93d26 3336
89ac5a1d 3337 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3338
89ac5a1d
DDAG
3339 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3340 i = 0;
3341 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3342 postcopy_has_request(rs)) {
89ac5a1d 3343 int pages;
e03a34f8 3344
89ac5a1d
DDAG
3345 if (qemu_file_get_error(f)) {
3346 break;
3347 }
e8f3735f 3348
05931ec5 3349 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3350 /* no more pages to send */
3351 if (pages == 0) {
3352 done = 1;
3353 break;
3354 }
e8f3735f 3355
89ac5a1d
DDAG
3356 if (pages < 0) {
3357 qemu_file_set_error(f, pages);
56e93d26
JQ
3358 break;
3359 }
89ac5a1d
DDAG
3360
3361 rs->target_page_count += pages;
3362
644acf99
WY
3363 /*
3364 * During postcopy, it is necessary to make sure one whole host
3365 * page is sent in one chunk.
3366 */
3367 if (migrate_postcopy_ram()) {
3368 flush_compressed_data(rs);
3369 }
3370
89ac5a1d
DDAG
3371 /*
3372 * We want to check in the 1st loop, just in case it was the 1st
3373 * time and we had to sync the dirty bitmap.
3374 * qemu_clock_get_ns() is a bit expensive, so we only check every
3375 * few iterations.
3376 */
3377 if ((i & 63) == 0) {
3378 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3379 1000000;
3380 if (t1 > MAX_WAIT) {
3381 trace_ram_save_iterate_big_wait(t1, i);
3382 break;
3383 }
3384 }
3385 i++;
56e93d26 3386 }
56e93d26 3387 }
63268c49 3388 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3389
3390 /*
3391 * Must occur before EOS (or any QEMUFile operation)
3392 * because of RDMA protocol.
3393 */
3394 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3395
b2557345 3396out:
b69a0227
JQ
3397 if (ret >= 0
3398 && migration_is_setup_or_active(migrate_get_current()->state)) {
7f401b80 3399 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3400 if (ret < 0) {
3401 return ret;
3402 }
3403
3d4095b2
JQ
3404 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3405 qemu_fflush(f);
4c2d0f6d 3406 ram_transferred_add(8);
56e93d26 3407
3d4095b2
JQ
3408 ret = qemu_file_get_error(f);
3409 }
56e93d26
JQ
3410 if (ret < 0) {
3411 return ret;
3412 }
3413
5c90308f 3414 return done;
56e93d26
JQ
3415}
3416
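/*
 * Illustrative summary (not part of the original file): the send loop in
 * ram_save_iterate() above keeps running while the rate limit is not
 * exceeded or a postcopy page request is pending, and it breaks out when:
 *   - the QEMUFile reports an error,
 *   - ram_find_and_save_block() returns 0 (no dirty pages left, done = 1),
 *   - it returns a negative value (propagated as a file error), or
 *   - more than MAX_WAIT ms have elapsed, checked only every 64 iterations
 *     because qemu_clock_get_ns() is relatively expensive.
 */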
3d0684b2
JQ
3417/**
3418 * ram_save_complete: function called to send the remaining amount of ram
3419 *
e8f3735f 3420 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3421 *
3422 * Called with iothread lock
3423 *
3424 * @f: QEMUFile where to send the data
3425 * @opaque: RAMState pointer
3426 */
56e93d26
JQ
3427static int ram_save_complete(QEMUFile *f, void *opaque)
3428{
53518d94
JQ
3429 RAMState **temp = opaque;
3430 RAMState *rs = *temp;
e8f3735f 3431 int ret = 0;
6f37bb8b 3432
05931ec5
JQ
3433 rs->last_stage = !migration_in_colo_state();
3434
89ac5a1d
DDAG
3435 WITH_RCU_READ_LOCK_GUARD() {
3436 if (!migration_in_postcopy()) {
3437 migration_bitmap_sync_precopy(rs);
3438 }
56e93d26 3439
89ac5a1d 3440 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3441
89ac5a1d 3442 /* try transferring iterative blocks of memory */
56e93d26 3443
89ac5a1d 3444 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3445 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3446 while (true) {
3447 int pages;
56e93d26 3448
05931ec5 3449 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3450 /* no more blocks to send */
3451 if (pages == 0) {
3452 break;
3453 }
3454 if (pages < 0) {
3455 ret = pages;
3456 break;
3457 }
e8f3735f 3458 }
c13221b5 3459 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3460
89ac5a1d
DDAG
3461 flush_compressed_data(rs);
3462 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3463 }
d09a6fde 3464
33d70973
LB
3465 if (ret < 0) {
3466 return ret;
3d4095b2 3467 }
56e93d26 3468
7f401b80 3469 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3470 if (ret < 0) {
3471 return ret;
3472 }
3473
3474 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3475 qemu_fflush(f);
3476
3477 return 0;
56e93d26
JQ
3478}
3479
24beea4e
JQ
3480static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3481 uint64_t *can_postcopy)
56e93d26 3482{
53518d94
JQ
3483 RAMState **temp = opaque;
3484 RAMState *rs = *temp;
56e93d26 3485
c8df4a7a 3486 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3487
c8df4a7a
JQ
3488 if (migrate_postcopy_ram()) {
3489 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3490 *can_postcopy += remaining_size;
c8df4a7a 3491 } else {
24beea4e 3492 *must_precopy += remaining_size;
c8df4a7a
JQ
3493 }
3494}
3495
24beea4e
JQ
3496static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3497 uint64_t *can_postcopy)
c8df4a7a 3498{
28ef5339 3499 MigrationState *s = migrate_get_current();
c8df4a7a
JQ
3500 RAMState **temp = opaque;
3501 RAMState *rs = *temp;
3502
3503 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3504
28ef5339 3505 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
56e93d26 3506 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3507 WITH_RCU_READ_LOCK_GUARD() {
3508 migration_bitmap_sync_precopy(rs);
3509 }
56e93d26 3510 qemu_mutex_unlock_iothread();
9edabd4d 3511 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3512 }
c31b098f 3513
86e1167e
VSO
3514 if (migrate_postcopy_ram()) {
3515 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3516 *can_postcopy += remaining_size;
86e1167e 3517 } else {
24beea4e 3518 *must_precopy += remaining_size;
86e1167e 3519 }
56e93d26
JQ
3520}
3521
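/*
 * Illustrative note (not part of the original file): both pending callbacks
 * above report remaining work as
 *
 *     remaining_size = migration_dirty_pages * TARGET_PAGE_SIZE
 *
 * e.g. with 4 KiB target pages, 10000 dirty pages report roughly 40 MB
 * still to send.  The "estimate" variant uses the counter as-is (cheap),
 * while the "exact" variant additionally re-syncs the dirty bitmap under
 * the iothread lock once the estimate drops below the migration threshold.
 * In both cases the bytes count as postcopiable only when postcopy-ram is
 * enabled.
 */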
3522static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3523{
3524 unsigned int xh_len;
3525 int xh_flags;
063e760a 3526 uint8_t *loaded_data;
56e93d26 3527
56e93d26
JQ
3528 /* extract RLE header */
3529 xh_flags = qemu_get_byte(f);
3530 xh_len = qemu_get_be16(f);
3531
3532 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3533 error_report("Failed to load XBZRLE page - wrong compression!");
3534 return -1;
3535 }
3536
3537 if (xh_len > TARGET_PAGE_SIZE) {
3538 error_report("Failed to load XBZRLE page - len overflow!");
3539 return -1;
3540 }
f265e0e4 3541 loaded_data = XBZRLE.decoded_buf;
56e93d26 3542 /* load data and decode */
f265e0e4 3543 /* it can change loaded_data to point to an internal buffer */
063e760a 3544 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3545
3546 /* decode RLE */
063e760a 3547 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3548 TARGET_PAGE_SIZE) == -1) {
3549 error_report("Failed to load XBZRLE page - decode error!");
3550 return -1;
3551 }
3552
3553 return 0;
3554}
3555
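/*
 * Illustrative sketch (not part of the original file): an XBZRLE page on
 * the wire, as parsed by load_xbzrle() above, is:
 *
 *   u8    xh_flags   (must be ENCODING_FLAG_XBZRLE)
 *   be16  xh_len     (encoded length, at most TARGET_PAGE_SIZE)
 *   []    xh_len bytes of encoded data
 *
 * The encoded data is a delta: xbzrle_decode_buffer() applies it on top of
 * the destination's current copy of the page, so the page must already hold
 * the version the source used as its reference.
 */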
3d0684b2
JQ
3556/**
3557 * ram_block_from_stream: read a RAMBlock id from the migration stream
3558 *
3559 * Must be called from within a rcu critical section.
3560 *
56e93d26 3561 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3562 *
755e8d7c 3563 * @mis: the migration incoming state pointer
3d0684b2
JQ
3564 * @f: QEMUFile where to read the data from
3565 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3566 * @channel: the channel we're using
a7180877 3567 */
755e8d7c 3568static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3569 QEMUFile *f, int flags,
3570 int channel)
56e93d26 3571{
c01b16ed 3572 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3573 char id[256];
3574 uint8_t len;
3575
3576 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3577 if (!block) {
56e93d26
JQ
3578 error_report("Ack, bad migration stream!");
3579 return NULL;
3580 }
4c4bad48 3581 return block;
56e93d26
JQ
3582 }
3583
3584 len = qemu_get_byte(f);
3585 qemu_get_buffer(f, (uint8_t *)id, len);
3586 id[len] = 0;
3587
e3dd7493 3588 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3589 if (!block) {
3590 error_report("Can't find block %s", id);
3591 return NULL;
56e93d26
JQ
3592 }
3593
fbd162e6 3594 if (ramblock_is_ignored(block)) {
b895de50
CLG
3595 error_report("block %s should not be migrated!", id);
3596 return NULL;
3597 }
3598
c01b16ed 3599 mis->last_recv_block[channel] = block;
755e8d7c 3600
4c4bad48
HZ
3601 return block;
3602}
3603
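/*
 * Illustrative note (not part of the original file): the block id is only
 * present on the wire when RAM_SAVE_FLAG_CONTINUE is clear:
 *
 *   u8   len
 *   []   len bytes of idstr
 *
 * With RAM_SAVE_FLAG_CONTINUE set, the page belongs to the same RAMBlock
 * as the previous record on this channel, so ram_block_from_stream()
 * simply reuses mis->last_recv_block[channel] and reads nothing extra.
 */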
3604static inline void *host_from_ram_block_offset(RAMBlock *block,
3605 ram_addr_t offset)
3606{
3607 if (!offset_in_ramblock(block, offset)) {
3608 return NULL;
3609 }
3610
3611 return block->host + offset;
56e93d26
JQ
3612}
3613
6a23f639
DH
3614static void *host_page_from_ram_block_offset(RAMBlock *block,
3615 ram_addr_t offset)
3616{
3617 /* Note: Explicitly no check against offset_in_ramblock(). */
3618 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3619 block->page_size);
3620}
3621
3622static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3623 ram_addr_t offset)
3624{
3625 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3626}
3627
13af18f2 3628static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3629 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3630{
3631 if (!offset_in_ramblock(block, offset)) {
3632 return NULL;
3633 }
3634 if (!block->colo_cache) {
3635 error_report("%s: colo_cache is NULL in block :%s",
3636 __func__, block->idstr);
3637 return NULL;
3638 }
7d9acafa
ZC
3639
3640 /*
3641 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3642 * It helps us decide which pages in the RAM cache should be flushed
3643 * into the VM's RAM later.
3644 */
8af66371
HZ
3645 if (record_bitmap &&
3646 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3647 ram_state->migration_dirty_pages++;
3648 }
13af18f2
ZC
3649 return block->colo_cache + offset;
3650}
3651
3d0684b2
JQ
3652/**
3653 * ram_handle_compressed: handle the zero page case
3654 *
56e93d26
JQ
3655 * If a page (or a whole RDMA chunk) has been
3656 * determined to be zero, then zap it.
3d0684b2
JQ
3657 *
3658 * @host: host address for the zero page
3659 * @ch: what the page is filled from. We only support zero
3660 * @size: size of the zero page
56e93d26
JQ
3661 */
3662void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3663{
bad452a7 3664 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3665 memset(host, ch, size);
3666 }
3667}
3668
797ca154
XG
3669/* return the size after decompression, or negative value on error */
3670static int
3671qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3672 const uint8_t *source, size_t source_len)
3673{
3674 int err;
3675
3676 err = inflateReset(stream);
3677 if (err != Z_OK) {
3678 return -1;
3679 }
3680
3681 stream->avail_in = source_len;
3682 stream->next_in = (uint8_t *)source;
3683 stream->avail_out = dest_len;
3684 stream->next_out = dest;
3685
3686 err = inflate(stream, Z_NO_FLUSH);
3687 if (err != Z_STREAM_END) {
3688 return -1;
3689 }
3690
3691 return stream->total_out;
3692}
3693
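/*
 * Illustrative sketch (not part of the original file): each decompress
 * thread keeps one z_stream that is inflateInit()'d once at setup time
 * (see compress_threads_load_setup() below), so only the cheap
 * inflateReset() above is paid per page.  A standalone caller following
 * the same pattern, with hypothetical buffer arrays, could look like:
 */
#if 0 /* example only, never compiled */
static int example_decompress_pages(const uint8_t *comp[], const size_t lens[],
                                    uint8_t *pages[], size_t npages)
{
    z_stream stream = { 0 };   /* zalloc/zfree/opaque default to Z_NULL */
    size_t i;

    if (inflateInit(&stream) != Z_OK) {        /* once, like load_setup */
        return -1;
    }
    for (i = 0; i < npages; i++) {             /* per page, as above */
        if (qemu_uncompress_data(&stream, pages[i], TARGET_PAGE_SIZE,
                                 comp[i], lens[i]) < 0) {
            inflateEnd(&stream);
            return -1;
        }
    }
    inflateEnd(&stream);                       /* once, like load_cleanup */
    return 0;
}
#endif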
56e93d26
JQ
3694static void *do_data_decompress(void *opaque)
3695{
3696 DecompressParam *param = opaque;
3697 unsigned long pagesize;
33d151f4 3698 uint8_t *des;
34ab9e97 3699 int len, ret;
56e93d26 3700
33d151f4 3701 qemu_mutex_lock(&param->mutex);
90e56fb4 3702 while (!param->quit) {
33d151f4
LL
3703 if (param->des) {
3704 des = param->des;
3705 len = param->len;
3706 param->des = 0;
3707 qemu_mutex_unlock(&param->mutex);
3708
56e93d26 3709 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3710
3711 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3712 param->compbuf, len);
f548222c 3713 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3714 error_report("decompress data failed");
3715 qemu_file_set_error(decomp_file, ret);
3716 }
73a8912b 3717
33d151f4
LL
3718 qemu_mutex_lock(&decomp_done_lock);
3719 param->done = true;
3720 qemu_cond_signal(&decomp_done_cond);
3721 qemu_mutex_unlock(&decomp_done_lock);
3722
3723 qemu_mutex_lock(&param->mutex);
3724 } else {
3725 qemu_cond_wait(&param->cond, &param->mutex);
3726 }
56e93d26 3727 }
33d151f4 3728 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3729
3730 return NULL;
3731}
3732
34ab9e97 3733static int wait_for_decompress_done(void)
5533b2e9
LL
3734{
3735 int idx, thread_count;
3736
3737 if (!migrate_use_compression()) {
34ab9e97 3738 return 0;
5533b2e9
LL
3739 }
3740
3741 thread_count = migrate_decompress_threads();
3742 qemu_mutex_lock(&decomp_done_lock);
3743 for (idx = 0; idx < thread_count; idx++) {
3744 while (!decomp_param[idx].done) {
3745 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3746 }
3747 }
3748 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3749 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3750}
3751
f0afa331 3752static void compress_threads_load_cleanup(void)
56e93d26
JQ
3753{
3754 int i, thread_count;
3755
3416ab5b
JQ
3756 if (!migrate_use_compression()) {
3757 return;
3758 }
56e93d26
JQ
3759 thread_count = migrate_decompress_threads();
3760 for (i = 0; i < thread_count; i++) {
797ca154
XG
3761 /*
3762 * we use it as an indicator of whether the thread has been
3763 * properly initialized or not
3764 */
3765 if (!decomp_param[i].compbuf) {
3766 break;
3767 }
3768
56e93d26 3769 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3770 decomp_param[i].quit = true;
56e93d26
JQ
3771 qemu_cond_signal(&decomp_param[i].cond);
3772 qemu_mutex_unlock(&decomp_param[i].mutex);
3773 }
3774 for (i = 0; i < thread_count; i++) {
797ca154
XG
3775 if (!decomp_param[i].compbuf) {
3776 break;
3777 }
3778
56e93d26
JQ
3779 qemu_thread_join(decompress_threads + i);
3780 qemu_mutex_destroy(&decomp_param[i].mutex);
3781 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3782 inflateEnd(&decomp_param[i].stream);
56e93d26 3783 g_free(decomp_param[i].compbuf);
797ca154 3784 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3785 }
3786 g_free(decompress_threads);
3787 g_free(decomp_param);
56e93d26
JQ
3788 decompress_threads = NULL;
3789 decomp_param = NULL;
34ab9e97 3790 decomp_file = NULL;
56e93d26
JQ
3791}
3792
34ab9e97 3793static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3794{
3795 int i, thread_count;
3796
3797 if (!migrate_use_compression()) {
3798 return 0;
3799 }
3800
3801 thread_count = migrate_decompress_threads();
3802 decompress_threads = g_new0(QemuThread, thread_count);
3803 decomp_param = g_new0(DecompressParam, thread_count);
3804 qemu_mutex_init(&decomp_done_lock);
3805 qemu_cond_init(&decomp_done_cond);
34ab9e97 3806 decomp_file = f;
797ca154
XG
3807 for (i = 0; i < thread_count; i++) {
3808 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3809 goto exit;
3810 }
3811
3812 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3813 qemu_mutex_init(&decomp_param[i].mutex);
3814 qemu_cond_init(&decomp_param[i].cond);
3815 decomp_param[i].done = true;
3816 decomp_param[i].quit = false;
3817 qemu_thread_create(decompress_threads + i, "decompress",
3818 do_data_decompress, decomp_param + i,
3819 QEMU_THREAD_JOINABLE);
3820 }
3821 return 0;
3822exit:
3823 compress_threads_load_cleanup();
3824 return -1;
3825}
3826
c1bc6626 3827static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3828 void *host, int len)
3829{
3830 int idx, thread_count;
3831
3832 thread_count = migrate_decompress_threads();
37396950 3833 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3834 while (true) {
3835 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3836 if (decomp_param[idx].done) {
33d151f4
LL
3837 decomp_param[idx].done = false;
3838 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3839 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3840 decomp_param[idx].des = host;
3841 decomp_param[idx].len = len;
33d151f4
LL
3842 qemu_cond_signal(&decomp_param[idx].cond);
3843 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3844 break;
3845 }
3846 }
3847 if (idx < thread_count) {
3848 break;
73a8912b
LL
3849 } else {
3850 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3851 }
3852 }
3853}
3854
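/*
 * Illustrative summary (not part of the original file): the handshake
 * between decompress_data_with_multi_threads() and do_data_decompress()
 * above works like this:
 *   - under decomp_done_lock, the main thread looks for a worker whose
 *     "done" flag is set, claims it (done = false), copies the compressed
 *     bytes into its compbuf while holding param->mutex, records des/len
 *     and signals param->cond;
 *   - the worker inflates compbuf into des, then sets done = true and
 *     signals decomp_done_cond under decomp_done_lock;
 *   - if no worker is idle, the main thread waits on decomp_done_cond, and
 *     wait_for_decompress_done() uses the same condition to drain all
 *     workers before a page is placed or the section finishes.
 */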
b70cb3b4
RL
3855static void colo_init_ram_state(void)
3856{
3857 ram_state_init(&ram_state);
b70cb3b4
RL
3858}
3859
13af18f2
ZC
3860/*
3861 * colo cache: this is for the secondary VM; we cache the whole
3862 * memory of the secondary VM.  The global lock must be held when
3863 * calling this helper.
3864 */
3865int colo_init_ram_cache(void)
3866{
3867 RAMBlock *block;
3868
44901b5a
PB
3869 WITH_RCU_READ_LOCK_GUARD() {
3870 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3871 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3872 NULL, false, false);
44901b5a
PB
3873 if (!block->colo_cache) {
3874 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3875 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3876 block->used_length);
3877 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3878 if (block->colo_cache) {
3879 qemu_anon_ram_free(block->colo_cache, block->used_length);
3880 block->colo_cache = NULL;
3881 }
89ac5a1d 3882 }
44901b5a 3883 return -errno;
89ac5a1d 3884 }
e5fdf920
LS
3885 if (!machine_dump_guest_core(current_machine)) {
3886 qemu_madvise(block->colo_cache, block->used_length,
3887 QEMU_MADV_DONTDUMP);
3888 }
13af18f2 3889 }
13af18f2 3890 }
44901b5a 3891
7d9acafa
ZC
3892 /*
3893 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3894 * decide which pages in the cache should be flushed into the SVM's RAM.
3895 * Here we use the same name 'ram_bitmap' as for migration.
3896 */
3897 if (ram_bytes_total()) {
3898 RAMBlock *block;
3899
fbd162e6 3900 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3901 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3902 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3903 }
3904 }
7d9acafa 3905
b70cb3b4 3906 colo_init_ram_state();
13af18f2 3907 return 0;
13af18f2
ZC
3908}
3909
0393031a
HZ
3910/* TODO: duplicated with ram_init_bitmaps */
3911void colo_incoming_start_dirty_log(void)
3912{
3913 RAMBlock *block = NULL;
3914 /* For memory_global_dirty_log_start below. */
3915 qemu_mutex_lock_iothread();
3916 qemu_mutex_lock_ramlist();
3917
3918 memory_global_dirty_log_sync();
3919 WITH_RCU_READ_LOCK_GUARD() {
3920 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3921 ramblock_sync_dirty_bitmap(ram_state, block);
3922 /* Discard this dirty bitmap record */
3923 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3924 }
63b41db4 3925 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3926 }
3927 ram_state->migration_dirty_pages = 0;
3928 qemu_mutex_unlock_ramlist();
3929 qemu_mutex_unlock_iothread();
3930}
3931
13af18f2
ZC
3932/* The global lock must be held to call this helper */
3933void colo_release_ram_cache(void)
3934{
3935 RAMBlock *block;
3936
63b41db4 3937 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3938 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3939 g_free(block->bmap);
3940 block->bmap = NULL;
3941 }
3942
89ac5a1d
DDAG
3943 WITH_RCU_READ_LOCK_GUARD() {
3944 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3945 if (block->colo_cache) {
3946 qemu_anon_ram_free(block->colo_cache, block->used_length);
3947 block->colo_cache = NULL;
3948 }
13af18f2
ZC
3949 }
3950 }
0393031a 3951 ram_state_cleanup(&ram_state);
13af18f2
ZC
3952}
3953
f265e0e4
JQ
3954/**
3955 * ram_load_setup: Setup RAM for migration incoming side
3956 *
3957 * Returns zero to indicate success and negative for error
3958 *
3959 * @f: QEMUFile where to receive the data
3960 * @opaque: RAMState pointer
3961 */
3962static int ram_load_setup(QEMUFile *f, void *opaque)
3963{
34ab9e97 3964 if (compress_threads_load_setup(f)) {
797ca154
XG
3965 return -1;
3966 }
3967
f265e0e4 3968 xbzrle_load_setup();
f9494614 3969 ramblock_recv_map_init();
13af18f2 3970
f265e0e4
JQ
3971 return 0;
3972}
3973
3974static int ram_load_cleanup(void *opaque)
3975{
f9494614 3976 RAMBlock *rb;
56eb90af 3977
fbd162e6 3978 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3979 qemu_ram_block_writeback(rb);
56eb90af
JH
3980 }
3981
f265e0e4 3982 xbzrle_load_cleanup();
f0afa331 3983 compress_threads_load_cleanup();
f9494614 3984
fbd162e6 3985 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3986 g_free(rb->receivedmap);
3987 rb->receivedmap = NULL;
3988 }
13af18f2 3989
f265e0e4
JQ
3990 return 0;
3991}
3992
3d0684b2
JQ
3993/**
3994 * ram_postcopy_incoming_init: allocate postcopy data structures
3995 *
3996 * Returns 0 for success and negative if there was one error
3997 *
3998 * @mis: current migration incoming state
3999 *
4000 * Allocate the data structures etc. needed by incoming migration with
4001 * postcopy-ram. postcopy-ram's similarly named
4002 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4003 */
4004int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4005{
c136180c 4006 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4007}
4008
3d0684b2
JQ
4009/**
4010 * ram_load_postcopy: load a page in postcopy case
4011 *
4012 * Returns 0 for success or -errno in case of error
4013 *
a7180877
DDAG
4014 * Called in postcopy mode by ram_load().
4015 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4016 *
4017 * @f: QEMUFile where to read the data from
36f62f11 4018 * @channel: the channel to use for loading
a7180877 4019 */
36f62f11 4020int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
4021{
4022 int flags = 0, ret = 0;
4023 bool place_needed = false;
1aa83678 4024 bool matches_target_page_size = false;
a7180877 4025 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 4026 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
4027
4028 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4029 ram_addr_t addr;
a7180877
DDAG
4030 void *page_buffer = NULL;
4031 void *place_source = NULL;
df9ff5e1 4032 RAMBlock *block = NULL;
a7180877 4033 uint8_t ch;
644acf99 4034 int len;
a7180877
DDAG
4035
4036 addr = qemu_get_be64(f);
7a9ddfbf
PX
4037
4038 /*
4039 * If qemu file error, we should stop here, and then "addr"
4040 * may be invalid
4041 */
4042 ret = qemu_file_get_error(f);
4043 if (ret) {
4044 break;
4045 }
4046
a7180877
DDAG
4047 flags = addr & ~TARGET_PAGE_MASK;
4048 addr &= TARGET_PAGE_MASK;
4049
36f62f11 4050 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4051 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4052 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4053 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4054 if (!block) {
4055 ret = -EINVAL;
4056 break;
4057 }
4c4bad48 4058
898ba906
DH
4059 /*
4060 * Relying on used_length is racy and can result in false positives.
4061 * We might place pages beyond used_length in case RAM was shrunk
4062 * while in postcopy, which is fine - trying to place via
4063 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4064 */
4065 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4066 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4067 ret = -EINVAL;
4068 break;
4069 }
77dadc3f 4070 tmp_page->target_pages++;
1aa83678 4071 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4072 /*
28abd200
DDAG
4073 * Postcopy requires that we place whole host pages atomically;
4074 * these may be huge pages for RAMBlocks that are backed by
4075 * hugetlbfs.
a7180877
DDAG
4076 * To make it atomic, the data is read into a temporary page
4077 * that's moved into place later.
4078 * The migration protocol uses (possibly smaller) target pages;
4079 * however, the source ensures it always sends all the components
91ba442f 4080 * of a host page in one chunk.
a7180877 4081 */
77dadc3f 4082 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4083 host_page_offset_from_ram_block_offset(block, addr);
4084 /* If all TP are zero then we can optimise the place */
77dadc3f
PX
4085 if (tmp_page->target_pages == 1) {
4086 tmp_page->host_addr =
4087 host_page_from_ram_block_offset(block, addr);
4088 } else if (tmp_page->host_addr !=
4089 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4090 /* not the 1st TP within the HP */
36f62f11 4091 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4092 "Target host page %p, received host page %p "
4093 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4094 channel, tmp_page->host_addr,
cfc7dc8a
PX
4095 host_page_from_ram_block_offset(block, addr),
4096 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4097 ret = -EINVAL;
4098 break;
a7180877
DDAG
4099 }
4100
4101 /*
4102 * If it's the last part of a host page then we place the host
4103 * page
4104 */
77dadc3f
PX
4105 if (tmp_page->target_pages ==
4106 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4107 place_needed = true;
4cbb3c63 4108 }
77dadc3f 4109 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4110 }
4111
4112 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4113 case RAM_SAVE_FLAG_ZERO:
a7180877 4114 ch = qemu_get_byte(f);
2e36bc1b
WY
4115 /*
4116 * We can skip setting page_buffer when
4117 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4118 */
4119 if (ch || !matches_target_page_size) {
4120 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4121 }
a7180877 4122 if (ch) {
77dadc3f 4123 tmp_page->all_zero = false;
a7180877
DDAG
4124 }
4125 break;
4126
4127 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4128 tmp_page->all_zero = false;
1aa83678
PX
4129 if (!matches_target_page_size) {
4130 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4131 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4132 } else {
1aa83678
PX
4133 /*
4134 * For small pages that matches target page size, we
4135 * avoid the qemu_file copy. Instead we directly use
4136 * the buffer of QEMUFile to place the page. Note: we
4137 * cannot do any QEMUFile operation before using that
4138 * buffer to make sure the buffer is valid when
4139 * placing the page.
a7180877
DDAG
4140 */
4141 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4142 TARGET_PAGE_SIZE);
4143 }
4144 break;
644acf99 4145 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4146 tmp_page->all_zero = false;
644acf99
WY
4147 len = qemu_get_be32(f);
4148 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4149 error_report("Invalid compressed data length: %d", len);
4150 ret = -EINVAL;
4151 break;
4152 }
4153 decompress_data_with_multi_threads(f, page_buffer, len);
4154 break;
4155
a7180877
DDAG
4156 case RAM_SAVE_FLAG_EOS:
4157 /* normal exit */
6df264ac 4158 multifd_recv_sync_main();
a7180877
DDAG
4159 break;
4160 default:
29fccade 4161 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4162 " (postcopy mode)", flags);
4163 ret = -EINVAL;
7a9ddfbf
PX
4164 break;
4165 }
4166
644acf99
WY
4167 /* Got the whole host page, wait for decompress before placing. */
4168 if (place_needed) {
4169 ret |= wait_for_decompress_done();
4170 }
4171
7a9ddfbf
PX
4172 /* Detect for any possible file errors */
4173 if (!ret && qemu_file_get_error(f)) {
4174 ret = qemu_file_get_error(f);
a7180877
DDAG
4175 }
4176
7a9ddfbf 4177 if (!ret && place_needed) {
77dadc3f
PX
4178 if (tmp_page->all_zero) {
4179 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4180 } else {
77dadc3f
PX
4181 ret = postcopy_place_page(mis, tmp_page->host_addr,
4182 place_source, block);
a7180877 4183 }
ddf35bdf 4184 place_needed = false;
77dadc3f 4185 postcopy_temp_page_reset(tmp_page);
a7180877 4186 }
a7180877
DDAG
4187 }
4188
4189 return ret;
4190}
4191
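/*
 * Illustrative note (not part of the original file): ram_load_postcopy()
 * above assembles one destination host page at a time.  Target pages are
 * accumulated in tmp_page->tmp_huge_page until
 * target_pages == block->page_size / TARGET_PAGE_SIZE; e.g. for a 2 MiB
 * hugetlbfs-backed block with 4 KiB target pages, 512 records are
 * collected before placement.  Only then is the whole host page placed
 * atomically, via postcopy_place_page_zero() if every chunk was zero, or
 * postcopy_place_page() otherwise, so the guest never observes a
 * partially filled huge page.
 */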
acab30b8
DHB
4192static bool postcopy_is_running(void)
4193{
4194 PostcopyState ps = postcopy_state_get();
4195 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4196}
4197
e6f4aa18
ZC
4198/*
4199 * Flush content of RAM cache into SVM's memory.
4200 * Only flush the pages that were dirtied by the PVM or the SVM or both.
4201 */
24fa16f8 4202void colo_flush_ram_cache(void)
e6f4aa18
ZC
4203{
4204 RAMBlock *block = NULL;
4205 void *dst_host;
4206 void *src_host;
4207 unsigned long offset = 0;
4208
d1955d22 4209 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4210 WITH_RCU_READ_LOCK_GUARD() {
4211 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4212 ramblock_sync_dirty_bitmap(ram_state, block);
4213 }
d1955d22 4214 }
d1955d22 4215
e6f4aa18 4216 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4217 WITH_RCU_READ_LOCK_GUARD() {
4218 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4219
89ac5a1d 4220 while (block) {
a6a83cef 4221 unsigned long num = 0;
e6f4aa18 4222
a6a83cef 4223 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4224 if (!offset_in_ramblock(block,
4225 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4226 offset = 0;
a6a83cef 4227 num = 0;
89ac5a1d
DDAG
4228 block = QLIST_NEXT_RCU(block, next);
4229 } else {
a6a83cef
RL
4230 unsigned long i = 0;
4231
4232 for (i = 0; i < num; i++) {
4233 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4234 }
8bba004c
AR
4235 dst_host = block->host
4236 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4237 src_host = block->colo_cache
4238 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4239 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4240 offset += num;
89ac5a1d 4241 }
e6f4aa18
ZC
4242 }
4243 }
e6f4aa18
ZC
4244 trace_colo_flush_ram_cache_end();
4245}
4246
10da4a36
WY
4247/**
4248 * ram_load_precopy: load pages in precopy case
4249 *
4250 * Returns 0 for success or -errno in case of error
4251 *
4252 * Called in precopy mode by ram_load().
4253 * rcu_read_lock is taken prior to this being called.
4254 *
4255 * @f: QEMUFile where to read the data from
4256 */
4257static int ram_load_precopy(QEMUFile *f)
56e93d26 4258{
755e8d7c 4259 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4260 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4261 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4262 bool postcopy_advised = migration_incoming_postcopy_advised();
edc60127
JQ
4263 if (!migrate_use_compression()) {
4264 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4265 }
a7180877 4266
10da4a36 4267 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4268 ram_addr_t addr, total_ram_bytes;
0393031a 4269 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4270 uint8_t ch;
4271
e65cec5e
YK
4272 /*
4273 * Yield periodically to let the main loop run, but an iteration of
4274 * the main loop is expensive, so only do it every few iterations
4275 */
4276 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4277 aio_co_schedule(qemu_get_current_aio_context(),
4278 qemu_coroutine_self());
4279 qemu_coroutine_yield();
4280 }
4281 i++;
4282
56e93d26
JQ
4283 addr = qemu_get_be64(f);
4284 flags = addr & ~TARGET_PAGE_MASK;
4285 addr &= TARGET_PAGE_MASK;
4286
edc60127
JQ
4287 if (flags & invalid_flags) {
4288 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4289 error_report("Received an unexpected compressed page");
4290 }
4291
4292 ret = -EINVAL;
4293 break;
4294 }
4295
bb890ed5 4296 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4297 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4298 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4299 RAM_CHANNEL_PRECOPY);
4c4bad48 4300
0393031a 4301 host = host_from_ram_block_offset(block, addr);
13af18f2 4302 /*
0393031a
HZ
4303 * After entering the COLO stage, we should not load pages into the
4304 * SVM's memory directly; we put them into colo_cache first.
4305 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4306 * Previously, we copied all of this memory in the COLO preparation
4307 * stage, during which the VM had to be stopped, which is time-consuming.
4308 * Here we optimize it with a trick: back up every page during the
4309 * migration process while COLO is enabled.  Although this affects the
4310 * speed of the migration, it clearly reduces the downtime of backing
4311 * up all of the SVM's memory in the COLO preparation stage.
13af18f2 4312 */
0393031a
HZ
4313 if (migration_incoming_colo_enabled()) {
4314 if (migration_incoming_in_colo_state()) {
4315 /* In COLO stage, put all pages into cache temporarily */
8af66371 4316 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4317 } else {
4318 /*
4319 * In migration stage but before COLO stage,
4320 * Put all pages into both cache and SVM's memory.
4321 */
8af66371 4322 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4323 }
13af18f2 4324 }
a776aa15
DDAG
4325 if (!host) {
4326 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4327 ret = -EINVAL;
4328 break;
4329 }
13af18f2
ZC
4330 if (!migration_incoming_in_colo_state()) {
4331 ramblock_recv_bitmap_set(block, host);
4332 }
4333
1db9d8e5 4334 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4335 }
4336
56e93d26
JQ
4337 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4338 case RAM_SAVE_FLAG_MEM_SIZE:
4339 /* Synchronize RAM block list */
4340 total_ram_bytes = addr;
4341 while (!ret && total_ram_bytes) {
4342 RAMBlock *block;
56e93d26
JQ
4343 char id[256];
4344 ram_addr_t length;
4345
4346 len = qemu_get_byte(f);
4347 qemu_get_buffer(f, (uint8_t *)id, len);
4348 id[len] = 0;
4349 length = qemu_get_be64(f);
4350
e3dd7493 4351 block = qemu_ram_block_by_name(id);
b895de50
CLG
4352 if (block && !qemu_ram_is_migratable(block)) {
4353 error_report("block %s should not be migrated!", id);
4354 ret = -EINVAL;
4355 } else if (block) {
e3dd7493
DDAG
4356 if (length != block->used_length) {
4357 Error *local_err = NULL;
56e93d26 4358
fa53a0e5 4359 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4360 &local_err);
4361 if (local_err) {
4362 error_report_err(local_err);
56e93d26 4363 }
56e93d26 4364 }
ef08fb38 4365 /* For postcopy we need to check hugepage sizes match */
e846b746 4366 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4367 block->page_size != qemu_host_page_size) {
4368 uint64_t remote_page_size = qemu_get_be64(f);
4369 if (remote_page_size != block->page_size) {
4370 error_report("Mismatched RAM page size %s "
4371 "(local) %zd != %" PRId64,
4372 id, block->page_size,
4373 remote_page_size);
4374 ret = -EINVAL;
4375 }
4376 }
fbd162e6
YK
4377 if (migrate_ignore_shared()) {
4378 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4379 if (ramblock_is_ignored(block) &&
4380 block->mr->addr != addr) {
4381 error_report("Mismatched GPAs for block %s "
4382 "%" PRId64 "!= %" PRId64,
4383 id, (uint64_t)addr,
4384 (uint64_t)block->mr->addr);
4385 ret = -EINVAL;
4386 }
4387 }
e3dd7493
DDAG
4388 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4389 block->idstr);
4390 } else {
56e93d26
JQ
4391 error_report("Unknown ramblock \"%s\", cannot "
4392 "accept migration", id);
4393 ret = -EINVAL;
4394 }
4395
4396 total_ram_bytes -= length;
4397 }
4398 break;
a776aa15 4399
bb890ed5 4400 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4401 ch = qemu_get_byte(f);
4402 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4403 break;
a776aa15 4404
56e93d26 4405 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4406 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4407 break;
56e93d26 4408
a776aa15 4409 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4410 len = qemu_get_be32(f);
4411 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4412 error_report("Invalid compressed data length: %d", len);
4413 ret = -EINVAL;
4414 break;
4415 }
c1bc6626 4416 decompress_data_with_multi_threads(f, host, len);
56e93d26 4417 break;
a776aa15 4418
56e93d26 4419 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4420 if (load_xbzrle(f, addr, host) < 0) {
4421 error_report("Failed to decompress XBZRLE page at "
4422 RAM_ADDR_FMT, addr);
4423 ret = -EINVAL;
4424 break;
4425 }
4426 break;
4427 case RAM_SAVE_FLAG_EOS:
4428 /* normal exit */
6df264ac 4429 multifd_recv_sync_main();
56e93d26
JQ
4430 break;
4431 default:
4432 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4433 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4434 } else {
29fccade 4435 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4436 flags);
4437 ret = -EINVAL;
4438 }
4439 }
4440 if (!ret) {
4441 ret = qemu_file_get_error(f);
4442 }
0393031a
HZ
4443 if (!ret && host_bak) {
4444 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4445 }
56e93d26
JQ
4446 }
4447
ca1a6b70 4448 ret |= wait_for_decompress_done();
10da4a36
WY
4449 return ret;
4450}
4451
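/*
 * Illustrative sketch (not part of the original file): every record that
 * ram_load_precopy() above consumes starts with a be64 whose low bits
 * (below TARGET_PAGE_MASK) carry the RAM_SAVE_FLAG_* bits and whose
 * remaining bits carry the page offset.  The payload then depends on the
 * flag:
 *
 *   MEM_SIZE       block list as emitted by ram_save_setup()
 *   ZERO           u8 fill byte (only zero is supported)
 *   PAGE           TARGET_PAGE_SIZE raw bytes
 *   COMPRESS_PAGE  be32 length + that many compressed bytes
 *   XBZRLE         XBZRLE header and encoded data (see load_xbzrle())
 *   EOS            nothing; ends the section and syncs multifd channels
 *
 * The CONTINUE bit means "same RAMBlock as the previous record", so no
 * block id string is present in that case.
 */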
4452static int ram_load(QEMUFile *f, void *opaque, int version_id)
4453{
4454 int ret = 0;
4455 static uint64_t seq_iter;
4456 /*
4457 * If system is running in postcopy mode, page inserts to host memory must
4458 * be atomic
4459 */
4460 bool postcopy_running = postcopy_is_running();
4461
4462 seq_iter++;
4463
4464 if (version_id != 4) {
4465 return -EINVAL;
4466 }
4467
4468 /*
4469 * This RCU critical section can be very long running.
4470 * When RCU reclaims in the code start to become numerous,
4471 * it will be necessary to reduce the granularity of this
4472 * critical section.
4473 */
89ac5a1d
DDAG
4474 WITH_RCU_READ_LOCK_GUARD() {
4475 if (postcopy_running) {
36f62f11
PX
4476 /*
4477 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4478 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4479 * service fast page faults.
4480 */
4481 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4482 } else {
4483 ret = ram_load_precopy(f);
4484 }
10da4a36 4485 }
55c4446b 4486 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4487
56e93d26
JQ
4488 return ret;
4489}
4490
c6467627
VSO
4491static bool ram_has_postcopy(void *opaque)
4492{
469dd51b 4493 RAMBlock *rb;
fbd162e6 4494 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4495 if (ramblock_is_pmem(rb)) {
4496 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4497 "is not supported now!", rb->idstr, rb->host);
4498 return false;
4499 }
4500 }
4501
c6467627
VSO
4502 return migrate_postcopy_ram();
4503}
4504
edd090c7
PX
4505/* Sync all the dirty bitmap with destination VM. */
4506static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4507{
4508 RAMBlock *block;
4509 QEMUFile *file = s->to_dst_file;
4510 int ramblock_count = 0;
4511
4512 trace_ram_dirty_bitmap_sync_start();
4513
fbd162e6 4514 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4515 qemu_savevm_send_recv_bitmap(file, block->idstr);
4516 trace_ram_dirty_bitmap_request(block->idstr);
4517 ramblock_count++;
4518 }
4519
4520 trace_ram_dirty_bitmap_sync_wait();
4521
4522 /* Wait until all the ramblocks' dirty bitmap synced */
4523 while (ramblock_count--) {
4524 qemu_sem_wait(&s->rp_state.rp_sem);
4525 }
4526
4527 trace_ram_dirty_bitmap_sync_complete();
4528
4529 return 0;
4530}
4531
4532static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4533{
4534 qemu_sem_post(&s->rp_state.rp_sem);
4535}
4536
a335debb
PX
4537/*
4538 * Read the received bitmap, revert it as the initial dirty bitmap.
4539 * This is only used when the postcopy migration is paused but wants
4540 * to resume from a middle point.
4541 */
4542int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4543{
4544 int ret = -EINVAL;
43044ac0 4545 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4546 QEMUFile *file = s->rp_state.from_dst_file;
4547 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4548 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4549 uint64_t size, end_mark;
4550
4551 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4552
4553 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4554 error_report("%s: incorrect state %s", __func__,
4555 MigrationStatus_str(s->state));
4556 return -EINVAL;
4557 }
4558
4559 /*
4560 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4561 * need the endianness conversion, and the paddings.
a335debb
PX
4562 */
4563 local_size = ROUND_UP(local_size, 8);
4564
4565 /* Add paddings */
4566 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4567
4568 size = qemu_get_be64(file);
4569
4570 /* The size of the bitmap should match with our ramblock */
4571 if (size != local_size) {
4572 error_report("%s: ramblock '%s' bitmap size mismatch "
4573 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4574 block->idstr, size, local_size);
4575 ret = -EINVAL;
4576 goto out;
4577 }
4578
4579 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4580 end_mark = qemu_get_be64(file);
4581
4582 ret = qemu_file_get_error(file);
4583 if (ret || size != local_size) {
4584 error_report("%s: read bitmap failed for ramblock '%s': %d"
4585 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4586 __func__, block->idstr, ret, local_size, size);
4587 ret = -EIO;
4588 goto out;
4589 }
4590
4591 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4592 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4593 __func__, block->idstr, end_mark);
4594 ret = -EINVAL;
4595 goto out;
4596 }
4597
4598 /*
3a4452d8 4599 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4600 * The dirty bitmap won't change. We can directly modify it.
4601 */
4602 bitmap_from_le(block->bmap, le_bitmap, nbits);
4603
4604 /*
4605 * What we received is the "received bitmap".  Invert it to form the
4606 * initial dirty bitmap for this ramblock.
4607 */
4608 bitmap_complement(block->bmap, block->bmap, nbits);
4609
be39b4cd
DH
4610 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4611 ramblock_dirty_bitmap_clear_discarded_pages(block);
4612
4613 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4614 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4615
edd090c7
PX
4616 /*
4617 * We succeeded in syncing the bitmap for the current ramblock. If this is
4618 * the last one to sync, we need to notify the main send thread.
4619 */
4620 ram_dirty_bitmap_reload_notify(s);
4621
a335debb
PX
4622 ret = 0;
4623out:
bf269906 4624 g_free(le_bitmap);
a335debb
PX
4625 return ret;
4626}
4627
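/*
 * Illustrative note (not part of the original file): the bitmap read by
 * ram_dirty_bitmap_reload() above is laid out as
 *
 *   be64  size      (the bitmap size in bytes, padded to a multiple of 8)
 *   []    size bytes of the received bitmap, little-endian long layout
 *   be64  end mark  (must be RAMBLOCK_RECV_BITMAP_ENDING)
 *
 * The received map is then complemented.  A worked example with a
 * hypothetical 8-page block: receivedmap 0b00001111 (pages 0-3 already
 * placed) becomes dirty bitmap 0b11110000, so only pages 4-7 are re-sent
 * once the paused postcopy migration resumes.
 */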
edd090c7
PX
4628static int ram_resume_prepare(MigrationState *s, void *opaque)
4629{
4630 RAMState *rs = *(RAMState **)opaque;
08614f34 4631 int ret;
edd090c7 4632
08614f34
PX
4633 ret = ram_dirty_bitmap_sync_all(s, rs);
4634 if (ret) {
4635 return ret;
4636 }
4637
4638 ram_state_resume_prepare(rs, s->to_dst_file);
4639
4640 return 0;
edd090c7
PX
4641}
4642
36f62f11
PX
4643void postcopy_preempt_shutdown_file(MigrationState *s)
4644{
4645 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4646 qemu_fflush(s->postcopy_qemufile_src);
4647}
4648
56e93d26 4649static SaveVMHandlers savevm_ram_handlers = {
9907e842 4650 .save_setup = ram_save_setup,
56e93d26 4651 .save_live_iterate = ram_save_iterate,
763c906b 4652 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4653 .save_live_complete_precopy = ram_save_complete,
c6467627 4654 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4655 .state_pending_exact = ram_state_pending_exact,
4656 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4657 .load_state = ram_load,
f265e0e4
JQ
4658 .save_cleanup = ram_save_cleanup,
4659 .load_setup = ram_load_setup,
4660 .load_cleanup = ram_load_cleanup,
edd090c7 4661 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4662};
4663
c7c0e724
DH
4664static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4665 size_t old_size, size_t new_size)
4666{
cc61c703 4667 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4668 ram_addr_t offset;
4669 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4670 Error *err = NULL;
4671
4672 if (ramblock_is_ignored(rb)) {
4673 return;
4674 }
4675
4676 if (!migration_is_idle()) {
4677 /*
4678 * Precopy code on the source cannot deal with the size of RAM blocks
4679 * changing at random points in time - especially after sending the
4680 * RAM block sizes in the migration stream, they must no longer change.
4681 * Abort and indicate a proper reason.
4682 */
4683 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4684 migration_cancel(err);
c7c0e724 4685 error_free(err);
c7c0e724 4686 }
cc61c703
DH
4687
4688 switch (ps) {
4689 case POSTCOPY_INCOMING_ADVISE:
4690 /*
4691 * Update what ram_postcopy_incoming_init()->init_range() does at the
4692 * time postcopy was advised. Syncing RAM blocks with the source will
4693 * result in RAM resizes.
4694 */
4695 if (old_size < new_size) {
4696 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4697 error_report("RAM block '%s' discard of resized RAM failed",
4698 rb->idstr);
4699 }
4700 }
898ba906 4701 rb->postcopy_length = new_size;
cc61c703
DH
4702 break;
4703 case POSTCOPY_INCOMING_NONE:
4704 case POSTCOPY_INCOMING_RUNNING:
4705 case POSTCOPY_INCOMING_END:
4706 /*
4707 * Once our guest is running, postcopy no longer cares about
4708 * resizes. When growing, the new memory was not available on the
4709 * source, so no handler is needed.
4710 */
4711 break;
4712 default:
4713 error_report("RAM block '%s' resized during postcopy state: %d",
4714 rb->idstr, ps);
4715 exit(-1);
4716 }
c7c0e724
DH
4717}
4718
4719static RAMBlockNotifier ram_mig_ram_notifier = {
4720 .ram_block_resized = ram_mig_ram_block_resized,
4721};
4722
56e93d26
JQ
4723void ram_mig_init(void)
4724{
4725 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4726 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4727 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4728}