/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h, start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

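/*
 * Illustrative note: because the page offsets written by
 * save_page_header() below are target-page aligned, the flags above fit
 * in the otherwise-zero low bits of the 64-bit offset word.  For
 * example, assuming a 4 KiB target page, a normal page at offset 0x3000
 * of the block that is already "current" on the wire is announced as
 *
 *     0x3000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE == 0x3028
 */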
XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

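/*
 * Usage sketch for the function above (hypothetical caller, size chosen
 * only for illustration):
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * The cache is only reallocated when one already exists and the size
 * actually changes; requests that do not fit in a size_t are rejected
 * up front.
 */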
static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required because the source and destination VMs may not use the
     * same endianness.  (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines.  We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

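/*
 * Worked example for the layout above (block size assumed only for
 * illustration): a RAMBlock covering 6000 target pages gives
 * nbits = 6000, so DIV_ROUND_UP(6000, 8) = 750 bytes of bitmap, padded
 * up to 752 bytes.  The stream then carries the 8-byte size, the 752
 * bitmap bytes and the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker, while
 * the function reports 752 + 8 = 760 bytes as its return value.
 */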
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

/*
 * NOTE: not all stats in ram_counters are used in reality.  See comments
 * for struct MigrationAtomicStats.  The ultimate result of ram migration
 * counters will be a merged version with both ram_counters and the atomic
 * fields in ram_atomic_counters.
 */
MigrationStats ram_counters;
MigrationAtomicStats ram_atomic_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    stat64_add(&ram_atomic_counters.transferred, bytes);
}

void dirty_sync_missed_zero_copy(void)
{
    ram_counters.dirty_sync_missed_zero_copy++;
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);
    QEMUFile *f = pss->pss_channel;

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

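/*
 * Worked example for save_page_header() (block name chosen only for
 * illustration): when a page starts a new block called "pc.ram", the
 * header is the 8-byte offset/flags word, a 1-byte name length and the
 * 6 name bytes, i.e. 8 + 1 + 6 = 15 bytes.  Every further page of the
 * same block gets RAM_SAVE_FLAG_CONTINUE and only the 8-byte word.
 */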
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

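/*
 * Worked example for the tailslow path above (all numbers are
 * illustrative): with the throttle currently at 20%, cpu_now = 80.  If
 * the guest dirtied twice the threshold (bytes_dirty_threshold /
 * bytes_dirty_period = 0.5), cpu_ideal = 40, so the computed step of
 * cpu_now - cpu_ideal = 40 is clamped to pct_increment (assume 10),
 * giving a new throttle of 30%, still capped by pct_max.
 */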
void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

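/*
 * Worked example for the record emitted above (encoded_len chosen only
 * for illustration): an XBZRLE page that encodes to 300 bytes on a
 * continued block is sent as the 8-byte header, one byte of
 * ENCODING_FLAG_XBZRLE, a 2-byte length and the 300 payload bytes, so
 * bytes_xbzrle = 8 + 1 + 2 + 300 = 311, of which 303 are accounted to
 * xbzrle_counters.bytes.
 */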
/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're in
 * the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If during sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

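/*
 * Chunk-size arithmetic for the helper above (values assumed only for
 * illustration): with 4 KiB target pages (TARGET_PAGE_BITS == 12) and a
 * clear_bmap_shift of 18, one clear-bitmap chunk covers
 * 1ULL << (12 + 18) = 1 GiB, so clearing the dirty log for any page in
 * that range clears it for the whole GiB-aligned chunk at once.
 */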
static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the
 * contiguous dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

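/*
 * Example for the search above (bitmap contents are illustrative): if
 * pages 5..8 of the block are the only dirty ones and the search starts
 * at page 2, find_next_bit() returns first = 5, find_next_zero_bit()
 * returns next = 9, so the function reports *num = 4 contiguous dirty
 * pages starting at page 5.
 */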
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock.  Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&ram_atomic_counters.normal) +
        stat64_get(&ram_atomic_counters.duplicate) +
        compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

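/*
 * Rate arithmetic used above, with illustrative numbers: if 1024 pages
 * of 4 KiB were handed to the compress threads during the period and
 * they produced 1 MiB of compressed payload, compression_rate =
 * (1024 * 4 KiB) / 1 MiB = 4.  The xbzrle encoding_rate is computed the
 * same way from the period's unencoded vs. encoded byte counts.
 */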
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;
    uint64_t bytes_xfer_period =
        stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

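/*
 * Threshold arithmetic, with assumed numbers: if 1 GiB was transferred
 * during the period and throttle_trigger_threshold is 50, then
 * bytes_dirty_threshold = 1 GiB * 50 / 100 = 512 MiB.  Only when the
 * guest dirties more than that in two consecutive sync periods does
 * mig_throttle_guest_down() get invoked.
 */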
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(PageSearchStatus *pss,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
                          ram_addr_t offset)
{
    int len = save_zero_page_to_file(pss, block, offset);

    if (len) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

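/*
 * Record-size example for the zero-page path above (continued block
 * assumed): the record is just the 8-byte header carrying
 * RAM_SAVE_FLAG_ZERO plus a single zero byte, so
 * save_zero_page_to_file() returns len = 9 and save_zero_page()
 * accounts those 9 bytes while counting one duplicate page.
 */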
/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the pages has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
                              ram_addr_t offset, int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
                                TARGET_PAGE_SIZE, &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        stat64_add(&ram_atomic_counters.normal, 1);
    } else if (bytes_xmit == 0) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    ram_transferred_add(save_page_header(pss, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&ram_atomic_counters.normal, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(file, block, offset) < 0) {
        return -1;
    }
    stat64_add(&ram_atomic_counters.normal, 1);

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(pss, block, offset)) {
        return true;
    }

    save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        stat64_add(&ram_atomic_counters.duplicate, 1);
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

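/*
 * Accounting example for the "- 8" above (payload size is illustrative):
 * if a compress thread produced a 300-byte zlib payload, the dummy file
 * holds the 8-byte continued-page header plus those 300 bytes, so
 * bytes_xmit = 308 while only 300 bytes are added to
 * compression_counters.compressed_size.
 */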
static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

31e2ac74
JQ
1558#define PAGE_ALL_CLEAN 0
1559#define PAGE_TRY_AGAIN 1
1560#define PAGE_DIRTY_FOUND 2
3d0684b2
JQ
1561/**
1562 * find_dirty_block: find the next dirty page and update any state
1563 * associated with the search process.
b9e60928 1564 *
31e2ac74
JQ
1565 * Returns:
1566 * PAGE_ALL_CLEAN: no dirty page found, give up
1567 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1568 * PAGE_DIRTY_FOUND: dirty page found
b9e60928 1569 *
6f37bb8b 1570 * @rs: current RAM state
3d0684b2
JQ
1571 * @pss: data about the state of the current dirty page scan
1572 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1573 */
31e2ac74 1574static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
b9e60928 1575{
d9e474ea
PX
1576 /* Update pss->page for the next dirty bit in ramblock */
1577 pss_find_next_dirty(pss);
1578
6f37bb8b 1579 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1580 pss->page >= rs->last_page) {
b9e60928
DDAG
1581 /*
1582 * We've been once around the RAM and haven't found anything.
1583 * Give up.
1584 */
31e2ac74 1585 return PAGE_ALL_CLEAN;
b9e60928 1586 }
542147f4
DH
1587 if (!offset_in_ramblock(pss->block,
1588 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1589 /* Didn't find anything in this RAM Block */
a935e30f 1590 pss->page = 0;
b9e60928
DDAG
1591 pss->block = QLIST_NEXT_RCU(pss->block, next);
1592 if (!pss->block) {
48df9d80
XG
1593 /*
1594 * If memory migration starts over, we will meet a dirtied page
1595 * which may still exists in compression threads's ring, so we
1596 * should flush the compressed data to make sure the new page
1597 * is not overwritten by the old one in the destination.
1598 *
1599 * Also If xbzrle is on, stop using the data compression at this
1600 * point. In theory, xbzrle can do better than compression.
1601 */
1602 flush_compressed_data(rs);
1603
b9e60928
DDAG
1604 /* Hit the end of the list */
1605 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1606 /* Flag that we've looped */
1607 pss->complete_round = true;
1a373522
DH
1608 /* After the first round, enable XBZRLE. */
1609 if (migrate_use_xbzrle()) {
1610 rs->xbzrle_enabled = true;
1611 }
b9e60928
DDAG
1612 }
1613 /* Didn't find anything this time, but try again on the new block */
31e2ac74 1614 return PAGE_TRY_AGAIN;
b9e60928 1615 } else {
31e2ac74
JQ
1616 /* We've found something */
1617 return PAGE_DIRTY_FOUND;
b9e60928
DDAG
1618 }
1619}
1620
3d0684b2
JQ
1621/**
1622 * unqueue_page: gets a page off the queue
1623 *
a82d593b 1624 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1625 *
3d0684b2
JQ
1626 * Returns the block of the page (or NULL if none available)
1627 *
ec481c6c 1628 * @rs: current RAM state
3d0684b2 1629 * @offset: used to return the offset within the RAMBlock
a82d593b 1630 */
f20e2865 1631static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1632{
a1fe28df 1633 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1634 RAMBlock *block = NULL;
1635
a1fe28df 1636 if (!postcopy_has_request(rs)) {
ae526e32
XG
1637 return NULL;
1638 }
1639
6e8a355d 1640 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1641
1642 /*
1643 * This should _never_ change even after we take the lock, because no one
1644 * should be taking anything off the request list other than us.
1645 */
1646 assert(postcopy_has_request(rs));
1647
1648 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1649 block = entry->rb;
1650 *offset = entry->offset;
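    /*
     * A request may cover more than one target page: peel one page off
     * the front of the entry, or free the entry once fully consumed.
     */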
1651
777f53c7
TH
1652 if (entry->len > TARGET_PAGE_SIZE) {
1653 entry->len -= TARGET_PAGE_SIZE;
1654 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1655 } else {
1656 memory_region_unref(block->mr);
1657 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1658 g_free(entry);
1659 migration_consume_urgent_request();
a82d593b 1660 }
a82d593b
DDAG
1661
1662 return block;
1663}
1664
278e2f55
AG
1665#if defined(__linux__)
1666/**
1667 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1668 * is found, return RAM block pointer and page offset
1669 *
1670 * Returns pointer to the RAMBlock containing faulting page,
1671 * NULL if no write faults are pending
1672 *
1673 * @rs: current RAM state
1674 * @offset: page offset from the beginning of the block
1675 */
1676static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1677{
1678 struct uffd_msg uffd_msg;
1679 void *page_address;
82ea3e3b 1680 RAMBlock *block;
278e2f55
AG
1681 int res;
1682
1683 if (!migrate_background_snapshot()) {
1684 return NULL;
1685 }
1686
1687 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1688 if (res <= 0) {
1689 return NULL;
1690 }
1691
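    /*
     * Resolve the faulting host address back to its RAMBlock and offset;
     * only write-protected blocks are registered with the uffd, hence
     * the assert below.
     */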
1692 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1693 block = qemu_ram_block_from_host(page_address, false, offset);
1694 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1695 return block;
278e2f55
AG
1696}
1697
1698/**
1699 * ram_save_release_protection: release UFFD write protection after
1700 * a range of pages has been saved
1701 *
1702 * @rs: current RAM state
1703 * @pss: page-search-status structure
1704 * @start_page: index of the first page in the range relative to pss->block
1705 *
1706 * Returns 0 on success, negative value in case of an error
1707 */
1708static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1709 unsigned long start_page)
1710{
1711 int res = 0;
1712
1713 /* Check if page is from UFFD-managed region. */
1714 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1715 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1716 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1717
1718 /* Flush async buffers before un-protect. */
61717ea9 1719 qemu_fflush(pss->pss_channel);
278e2f55
AG
1720 /* Un-protect memory range. */
1721 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1722 false, false);
1723 }
1724
1725 return res;
1726}
1727
1728/* ram_write_tracking_available: check if kernel supports required UFFD features
1729 *
1730 * Returns true if supported, false otherwise
1731 */
1732bool ram_write_tracking_available(void)
1733{
1734 uint64_t uffd_features;
1735 int res;
1736
1737 res = uffd_query_features(&uffd_features);
1738 return (res == 0 &&
1739 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1740}
1741
1742/* ram_write_tracking_compatible: check if guest configuration is
1743 * compatible with 'write-tracking'
1744 *
1745 * Returns true if compatible, false otherwise
1746 */
1747bool ram_write_tracking_compatible(void)
1748{
1749 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1750 int uffd_fd;
82ea3e3b 1751 RAMBlock *block;
278e2f55
AG
1752 bool ret = false;
1753
1754 /* Open UFFD file descriptor */
1755 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1756 if (uffd_fd < 0) {
1757 return false;
1758 }
1759
1760 RCU_READ_LOCK_GUARD();
1761
82ea3e3b 1762 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1763 uint64_t uffd_ioctls;
1764
1765 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1766 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1767 continue;
1768 }
1769 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1770 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1771 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1772 goto out;
1773 }
1774 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1775 goto out;
1776 }
1777 }
1778 ret = true;
1779
1780out:
1781 uffd_close_fd(uffd_fd);
1782 return ret;
1783}
1784
f7b9dcfb
DH
1785static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1786 ram_addr_t size)
1787{
5f19a449
DH
1788 const ram_addr_t end = offset + size;
1789
f7b9dcfb
DH
1790 /*
1791 * We read one byte of each page; this will preallocate page tables if
1792 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1793 * where no page was populated yet. This might require adaptation when
1794 * supporting other mappings, like shmem.
1795 */
5f19a449 1796 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1797 char tmp = *((char *)block->host + offset);
1798
1799 /* Don't optimize the read out */
1800 asm volatile("" : "+r" (tmp));
1801 }
1802}
1803
6fee3a1f
DH
1804static inline int populate_read_section(MemoryRegionSection *section,
1805 void *opaque)
1806{
1807 const hwaddr size = int128_get64(section->size);
1808 hwaddr offset = section->offset_within_region;
1809 RAMBlock *block = section->mr->ram_block;
1810
1811 populate_read_range(block, offset, size);
1812 return 0;
1813}
1814
eeccb99c 1815/*
f7b9dcfb
DH
1816 * ram_block_populate_read: preallocate page tables and populate pages in the
1817 * RAM block by reading a byte of each page.
eeccb99c
AG
1818 *
1819 * Since it's solely used for userfault_fd WP feature, here we just
1820 * hardcode page size to qemu_real_host_page_size.
1821 *
82ea3e3b 1822 * @rb: RAM block to populate
eeccb99c 1823 */
6fee3a1f 1824static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1825{
6fee3a1f
DH
1826 /*
1827 * Skip populating all pages that fall into a discarded range as managed by
1828 * a RamDiscardManager responsible for the mapped memory region of the
1829 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1830 * must not get populated automatically. We don't have to track
1831 * modifications via userfaultfd WP reliably, because these pages will
1832 * not be part of the migration stream either way -- see
1833 * ramblock_dirty_bitmap_exclude_discarded_pages().
1834 *
1835 * Note: The result is only stable while migrating (precopy/postcopy).
1836 */
1837 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1838 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1839 MemoryRegionSection section = {
1840 .mr = rb->mr,
1841 .offset_within_region = 0,
1842 .size = rb->mr->size,
1843 };
1844
1845 ram_discard_manager_replay_populated(rdm, &section,
1846 populate_read_section, NULL);
1847 } else {
1848 populate_read_range(rb, 0, rb->used_length);
1849 }
eeccb99c
AG
1850}
1851
1852/*
1853 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1854 */
1855void ram_write_tracking_prepare(void)
1856{
82ea3e3b 1857 RAMBlock *block;
eeccb99c
AG
1858
1859 RCU_READ_LOCK_GUARD();
1860
82ea3e3b 1861 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1862 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1863 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1864 continue;
1865 }
1866
1867 /*
1868 * Populate pages of the RAM block before enabling userfault_fd
1869 * write protection.
1870 *
1871 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1872 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1873 * pages with pte_none() entries in page table.
1874 */
f7b9dcfb 1875 ram_block_populate_read(block);
eeccb99c
AG
1876 }
1877}
1878
e41c5770
DH
1879static inline int uffd_protect_section(MemoryRegionSection *section,
1880 void *opaque)
1881{
1882 const hwaddr size = int128_get64(section->size);
1883 const hwaddr offset = section->offset_within_region;
1884 RAMBlock *rb = section->mr->ram_block;
1885 int uffd_fd = (uintptr_t)opaque;
1886
1887 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1888 false);
1889}
1890
1891static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1892{
1893 assert(rb->flags & RAM_UF_WRITEPROTECT);
1894
1895 /* See ram_block_populate_read() */
1896 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1897 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1898 MemoryRegionSection section = {
1899 .mr = rb->mr,
1900 .offset_within_region = 0,
1901 .size = rb->mr->size,
1902 };
1903
1904 return ram_discard_manager_replay_populated(rdm, &section,
1905 uffd_protect_section,
1906 (void *)(uintptr_t)uffd_fd);
1907 }
1908 return uffd_change_protection(uffd_fd, rb->host,
1909 rb->used_length, true, false);
1910}
1911
278e2f55
AG
1912/*
1913 * ram_write_tracking_start: start UFFD-WP memory tracking
1914 *
1915 * Returns 0 for success or negative value in case of error
1916 */
1917int ram_write_tracking_start(void)
1918{
1919 int uffd_fd;
1920 RAMState *rs = ram_state;
82ea3e3b 1921 RAMBlock *block;
278e2f55
AG
1922
1923 /* Open UFFD file descriptor */
1924 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1925 if (uffd_fd < 0) {
1926 return uffd_fd;
1927 }
1928 rs->uffdio_fd = uffd_fd;
1929
1930 RCU_READ_LOCK_GUARD();
1931
82ea3e3b 1932 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1933 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1934 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1935 continue;
1936 }
1937
1938 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1939 if (uffd_register_memory(rs->uffdio_fd, block->host,
1940 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1941 goto fail;
1942 }
72ef3a37
DH
1943 block->flags |= RAM_UF_WRITEPROTECT;
1944 memory_region_ref(block->mr);
1945
278e2f55 1946 /* Apply UFFD write protection to the block memory range */
e41c5770 1947 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1948 goto fail;
1949 }
278e2f55 1950
82ea3e3b
AG
1951 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1952 block->host, block->max_length);
278e2f55
AG
1953 }
1954
1955 return 0;
1956
1957fail:
1958 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1959
82ea3e3b
AG
1960 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1961 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1962 continue;
1963 }
82ea3e3b 1964 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1965 /* Cleanup flags and remove reference */
82ea3e3b
AG
1966 block->flags &= ~RAM_UF_WRITEPROTECT;
1967 memory_region_unref(block->mr);
278e2f55
AG
1968 }
1969
1970 uffd_close_fd(uffd_fd);
1971 rs->uffdio_fd = -1;
1972 return -1;
1973}
1974
1975/**
1976 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1977 */
1978void ram_write_tracking_stop(void)
1979{
1980 RAMState *rs = ram_state;
82ea3e3b 1981 RAMBlock *block;
278e2f55
AG
1982
1983 RCU_READ_LOCK_GUARD();
1984
82ea3e3b
AG
1985 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1986 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1987 continue;
1988 }
82ea3e3b 1989 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1990
82ea3e3b
AG
1991 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1992 block->host, block->max_length);
278e2f55
AG
1993
1994 /* Cleanup flags and remove reference */
82ea3e3b
AG
1995 block->flags &= ~RAM_UF_WRITEPROTECT;
1996 memory_region_unref(block->mr);
278e2f55
AG
1997 }
1998
1999 /* Finally close UFFD file descriptor */
2000 uffd_close_fd(rs->uffdio_fd);
2001 rs->uffdio_fd = -1;
2002}
2003
2004#else
2005/* No target OS support, stubs just fail or ignore */
2006
2007static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2008{
2009 (void) rs;
2010 (void) offset;
2011
2012 return NULL;
2013}
2014
2015static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2016 unsigned long start_page)
2017{
2018 (void) rs;
2019 (void) pss;
2020 (void) start_page;
2021
2022 return 0;
2023}
2024
2025bool ram_write_tracking_available(void)
2026{
2027 return false;
2028}
2029
2030bool ram_write_tracking_compatible(void)
2031{
2032 assert(0);
2033 return false;
2034}
2035
2036int ram_write_tracking_start(void)
2037{
2038 assert(0);
2039 return -1;
2040}
2041
2042void ram_write_tracking_stop(void)
2043{
2044 assert(0);
2045}
2046#endif /* defined(__linux__) */
2047
3d0684b2 2048/**
ff1543af 2049 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2050 *
2051 * Skips pages that are already sent (!dirty)
a82d593b 2052 *
a5f7b1a6 2053 * Returns true if a queued page is found
a82d593b 2054 *
6f37bb8b 2055 * @rs: current RAM state
3d0684b2 2056 * @pss: data about the state of the current dirty page scan
a82d593b 2057 */
f20e2865 2058static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2059{
2060 RAMBlock *block;
2061 ram_addr_t offset;
777f53c7
TH
2062 bool dirty;
2063
2064 do {
2065 block = unqueue_page(rs, &offset);
2066 /*
2067 * We're sending this page, and since it's postcopy nothing else
2068 * will dirty it, and we must make sure it doesn't get sent again
2069 * even if this queue request was received after the background
2070 * search already sent it.
2071 */
2072 if (block) {
2073 unsigned long page;
2074
2075 page = offset >> TARGET_PAGE_BITS;
2076 dirty = test_bit(page, block->bmap);
2077 if (!dirty) {
2078 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2079 page);
2080 } else {
2081 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2082 }
2083 }
a82d593b 2084
777f53c7 2085 } while (block && !dirty);
a82d593b 2086
b062106d 2087 if (!block) {
278e2f55
AG
2088 /*
2089 * Poll write faults too if background snapshot is enabled; that's
2090 * when vcpus may be blocked by write-protected pages.
2091 */
2092 block = poll_fault_page(rs, &offset);
2093 }
2094
a82d593b 2095 if (block) {
a82d593b
DDAG
2096 /*
2097 * We want the background search to continue from the queued page
2098 * since the guest is likely to want other pages near to the page
2099 * it just requested.
2100 */
2101 pss->block = block;
a935e30f 2102 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2103
2104 /*
2105 * This unqueued page would break the "one round" check, even if it
2106 * is really rare.
2107 */
2108 pss->complete_round = false;
a82d593b
DDAG
2109 }
2110
2111 return !!block;
2112}
2113
6c595cde 2114/**
5e58f968
JQ
2115 * migration_page_queue_free: drop any remaining pages in the ram
2116 * request queue
6c595cde 2117 *
3d0684b2
JQ
2118 * It should be empty at the end anyway, but in error cases there may
2119 * be some left. In case there are any pages left, we drop them.
2120 *
6c595cde 2121 */
83c13382 2122static void migration_page_queue_free(RAMState *rs)
6c595cde 2123{
ec481c6c 2124 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2125 /* This queue generally should be empty - but in the case of a failed
2126 * migration it might have some droppings in.
2127 */
89ac5a1d 2128 RCU_READ_LOCK_GUARD();
ec481c6c 2129 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2130 memory_region_unref(mspr->rb->mr);
ec481c6c 2131 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2132 g_free(mspr);
2133 }
6c595cde
DDAG
2134}
2135
2136/**
3d0684b2
JQ
2137 * ram_save_queue_pages: queue the page for transmission
2138 *
2139 * A request from postcopy destination for example.
2140 *
2141 * Returns zero on success or negative on error
2142 *
3d0684b2
JQ
2143 * @rbname: Name of the RAMBlock of the request. NULL means the
2144 * same as the last one.
2145 * @start: starting address from the start of the RAMBlock
2146 * @len: length (in bytes) to send
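 *
 * Example (hypothetical values, for illustration only): a postcopy fault
 * on the first page of a block named "pc.ram" would typically end up here
 * as
 *     ram_save_queue_pages("pc.ram", 0, TARGET_PAGE_SIZE);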
6c595cde 2147 */
96506894 2148int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2149{
2150 RAMBlock *ramblock;
53518d94 2151 RAMState *rs = ram_state;
6c595cde 2152
9360447d 2153 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2154 RCU_READ_LOCK_GUARD();
2155
6c595cde
DDAG
2156 if (!rbname) {
2157 /* Reuse last RAMBlock */
68a098f3 2158 ramblock = rs->last_req_rb;
6c595cde
DDAG
2159
2160 if (!ramblock) {
2161 /*
2162 * Shouldn't happen, we can't reuse the last RAMBlock if
2163 * it's the 1st request.
2164 */
2165 error_report("ram_save_queue_pages no previous block");
03acb4e9 2166 return -1;
6c595cde
DDAG
2167 }
2168 } else {
2169 ramblock = qemu_ram_block_by_name(rbname);
2170
2171 if (!ramblock) {
2172 /* We shouldn't be asked for a non-existent RAMBlock */
2173 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2174 return -1;
6c595cde 2175 }
68a098f3 2176 rs->last_req_rb = ramblock;
6c595cde
DDAG
2177 }
2178 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2179 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2180 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2181 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2182 __func__, start, len, ramblock->used_length);
03acb4e9 2183 return -1;
6c595cde
DDAG
2184 }
2185
93589827
PX
2186 /*
2187 * When with postcopy preempt, we send back the page directly in the
2188 * rp-return thread.
2189 */
2190 if (postcopy_preempt_active()) {
2191 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2192 size_t page_size = qemu_ram_pagesize(ramblock);
2193 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2194 int ret = 0;
2195
2196 qemu_mutex_lock(&rs->bitmap_mutex);
2197
2198 pss_init(pss, ramblock, page_start);
2199 /*
2200 * Always use the preempt channel, and make sure it's there. It's
2202 * safe to access without the lock, because when the rp-thread is running
2203 * we should be the only one operating on the qemufile.
2203 */
2204 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2205 assert(pss->pss_channel);
2206
2207 /*
2208 * It must be either one or multiple of host page size. Just
2209 * assert; if something wrong we're mostly split brain anyway.
2210 */
2211 assert(len % page_size == 0);
2212 while (len) {
2213 if (ram_save_host_page_urgent(pss)) {
2214 error_report("%s: ram_save_host_page_urgent() failed: "
2215 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2216 __func__, ramblock->idstr, start);
2217 ret = -1;
2218 break;
2219 }
2220 /*
2221 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2222 * will automatically be moved and point to the next host page
2223 * we're going to send, so no need to update here.
2224 *
2225 * Normally QEMU never sends >1 host page in requests, so
2226 * logically we don't even need that as the loop should only
2227 * run once, but just to be consistent.
2228 */
2229 len -= page_size;
2230 };
2231 qemu_mutex_unlock(&rs->bitmap_mutex);
2232
2233 return ret;
2234 }
2235
ec481c6c 2236 struct RAMSrcPageRequest *new_entry =
b21e2380 2237 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2238 new_entry->rb = ramblock;
2239 new_entry->offset = start;
2240 new_entry->len = len;
2241
2242 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2243 qemu_mutex_lock(&rs->src_page_req_mutex);
2244 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2245 migration_make_urgent_request();
ec481c6c 2246 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2247
2248 return 0;
6c595cde
DDAG
2249}
2250
d7400a34
XG
2251static bool save_page_use_compression(RAMState *rs)
2252{
2253 if (!migrate_use_compression()) {
2254 return false;
2255 }
2256
2257 /*
1a373522
DH
2258 * If xbzrle is enabled (e.g., after first round of migration), stop
2259 * using the data compression. In theory, xbzrle can do better than
2260 * compression.
d7400a34 2261 */
1a373522
DH
2262 if (rs->xbzrle_enabled) {
2263 return false;
d7400a34
XG
2264 }
2265
1a373522 2266 return true;
d7400a34
XG
2267}
2268
5e5fdcff
XG
2269/*
2270 * try to compress the page before posting it out, return true if the page
2271 * has been properly handled by compression, otherwise needs other
2272 * paths to handle it
2273 */
ec6f3ab9
PX
2274static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2275 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2276{
2277 if (!save_page_use_compression(rs)) {
2278 return false;
2279 }
2280
2281 /*
2282 * When starting the process of a new block, the first page of
2283 * the block should be sent out before other pages in the same
2284 * block, and all the pages in the last block should have been sent
2285 * out; keeping this order is important, because the 'cont' flag
2286 * is used to avoid resending the block name.
2287 *
2288 * We post the first page as a normal page because compression takes
2289 * a lot of CPU resource.
2290 */
ec6f3ab9 2291 if (block != pss->last_sent_block) {
5e5fdcff
XG
2292 flush_compressed_data(rs);
2293 return false;
2294 }
2295
eaa238ab 2296 if (compress_page_with_multi_thread(block, offset) > 0) {
5e5fdcff
XG
2297 return true;
2298 }
2299
76e03000 2300 compression_counters.busy++;
5e5fdcff
XG
2301 return false;
2302}
2303
a82d593b 2304/**
4010ba38 2305 * ram_save_target_page_legacy: save one target page
a82d593b 2306 *
3d0684b2 2307 * Returns the number of pages written
a82d593b 2308 *
6f37bb8b 2309 * @rs: current RAM state
3d0684b2 2310 * @pss: data about the page we want to send
a82d593b 2311 */
4010ba38 2312static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2313{
a8ec91f9 2314 RAMBlock *block = pss->block;
8bba004c 2315 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2316 int res;
2317
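    /*
     * Try the cheaper/special paths in order: the RDMA control path
     * first, then multi-threaded compression, then the zero-page check;
     * only afterwards fall back to multifd or a plain RAM_SAVE_FLAG_PAGE.
     */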
61717ea9 2318 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2319 return res;
2320 }
2321
ec6f3ab9 2322 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2323 return 1;
d7400a34
XG
2324 }
2325
ec6f3ab9 2326 res = save_zero_page(pss, block, offset);
d7400a34
XG
2327 if (res > 0) {
2328 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2329 * page would be stale
2330 */
ef5c3d13 2331 if (rs->xbzrle_enabled) {
d7400a34
XG
2332 XBZRLE_cache_lock();
2333 xbzrle_cache_zero_page(rs, block->offset + offset);
2334 XBZRLE_cache_unlock();
2335 }
d7400a34
XG
2336 return res;
2337 }
2338
da3f56cb 2339 /*
6f39c90b
PX
2340 * Do not use multifd in postcopy as one whole host page should be
2341 * placed. Meanwhile postcopy requires atomic update of pages, so even
2342 * if host page size == guest page size the dest guest during run may
2343 * still see partially copied pages which is data corruption.
da3f56cb 2344 */
6f39c90b 2345 if (migrate_use_multifd() && !migration_in_postcopy()) {
61717ea9 2346 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2347 }
2348
05931ec5 2349 return ram_save_page(rs, pss);
a82d593b
DDAG
2350}
2351
d9e474ea
PX
2352/* Should be called before sending a host page */
2353static void pss_host_page_prepare(PageSearchStatus *pss)
2354{
2355 /* How many guest pages are there in one host page? */
2356 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2357
2358 pss->host_page_sending = true;
301d7ffe
PX
2359 if (guest_pfns <= 1) {
2360 /*
2361 * This covers both when guest psize == host psize, or when guest
2362 * has larger psize than the host (guest_pfns==0).
2363 *
2364 * For the latter, we always send one whole guest page per
2365 * iteration of the host page (example: an Alpha VM on x86 host
2366 * will have guest psize 8K while host psize 4K).
2367 */
2368 pss->host_page_start = pss->page;
2369 pss->host_page_end = pss->page + 1;
2370 } else {
2371 /*
2372 * The host page spans over multiple guest pages, we send them
2373 * within the same host page iteration.
2374 */
2375 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2376 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2377 }
d9e474ea
PX
2378}
2379
2380/*
2381 * Whether the page pointed by PSS is within the host page being sent.
2382 * Must be called after a previous pss_host_page_prepare().
2383 */
2384static bool pss_within_range(PageSearchStatus *pss)
2385{
2386 ram_addr_t ram_addr;
2387
2388 assert(pss->host_page_sending);
2389
2390 /* Over host-page boundary? */
2391 if (pss->page >= pss->host_page_end) {
2392 return false;
2393 }
2394
2395 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2396
2397 return offset_in_ramblock(pss->block, ram_addr);
2398}
2399
2400static void pss_host_page_finish(PageSearchStatus *pss)
2401{
2402 pss->host_page_sending = false;
2403 /* This is not needed, but just to reset it */
2404 pss->host_page_start = pss->host_page_end = 0;
2405}
2406
93589827
PX
2407/*
2408 * Send an urgent host page specified by `pss'. Must be called with
2409 * bitmap_mutex held.
2410 *
2411 * Returns 0 if saving the host page succeeded, a negative value otherwise.
2412 */
2413static int ram_save_host_page_urgent(PageSearchStatus *pss)
2414{
2415 bool page_dirty, sent = false;
2416 RAMState *rs = ram_state;
2417 int ret = 0;
2418
2419 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2420 pss_host_page_prepare(pss);
2421
2422 /*
2423 * If precopy is sending the same page, let it be done in precopy, or
2424 * we could send the same page in two channels and none of them will
2425 * receive the whole page.
2426 */
2427 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2428 trace_postcopy_preempt_hit(pss->block->idstr,
2429 pss->page << TARGET_PAGE_BITS);
2430 return 0;
2431 }
2432
2433 do {
2434 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2435
2436 if (page_dirty) {
2437 /* Be strict to return code; it must be 1, or what else? */
4010ba38 2438 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2439 error_report_once("%s: ram_save_target_page failed", __func__);
2440 ret = -1;
2441 goto out;
2442 }
2443 sent = true;
2444 }
2445 pss_find_next_dirty(pss);
2446 } while (pss_within_range(pss));
2447out:
2448 pss_host_page_finish(pss);
2449 /* For urgent requests, flush immediately if sent */
2450 if (sent) {
2451 qemu_fflush(pss->pss_channel);
2452 }
2453 return ret;
2454}
2455
a82d593b 2456/**
3d0684b2 2457 * ram_save_host_page: save a whole host page
a82d593b 2458 *
3d0684b2
JQ
2459 * Starting at *offset send pages up to the end of the current host
2460 * page. It's valid for the initial offset to point into the middle of
2461 * a host page, in which case the remainder of the host page is sent.
2462 * Only dirty target pages are sent. Note that the host page size may
2463 * be a huge page for this block.
f3321554 2464 *
1eb3fc0a
DDAG
2465 * The saving stops at the boundary of the used_length of the block
2466 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2467 *
f3321554
PX
2468 * The caller must be with ram_state.bitmap_mutex held to call this
2469 * function. Note that this function can temporarily release the lock, but
2470 * when the function is returned it'll make sure the lock is still held.
2471 *
3d0684b2
JQ
2472 * Returns the number of pages written or negative on error
2473 *
6f37bb8b 2474 * @rs: current RAM state
3d0684b2 2475 * @pss: data about the page we want to send
a82d593b 2476 */
05931ec5 2477static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2478{
f3321554 2479 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2480 int tmppages, pages = 0;
a935e30f
JQ
2481 size_t pagesize_bits =
2482 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2483 unsigned long start_page = pss->page;
2484 int res;
4c011c37 2485
fbd162e6 2486 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2487 error_report("block %s should not be migrated !", pss->block->idstr);
2488 return 0;
2489 }
2490
d9e474ea
PX
2491 /* Update host page boundary information */
2492 pss_host_page_prepare(pss);
2493
a82d593b 2494 do {
f3321554 2495 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2496
f3321554
PX
2497 /* Check whether the page is dirty and, if it is, send it */
2498 if (page_dirty) {
ba1b7c81 2499 /*
f3321554
PX
2500 * Properly yield the lock only in postcopy preempt mode
2501 * because both migration thread and rp-return thread can
2502 * operate on the bitmaps.
ba1b7c81 2503 */
f3321554
PX
2504 if (preempt_active) {
2505 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2506 }
4010ba38 2507 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2508 if (tmppages >= 0) {
2509 pages += tmppages;
2510 /*
2511 * Allow rate limiting to happen in the middle of huge pages if
2512 * something is sent in the current iteration.
2513 */
2514 if (pagesize_bits > 1 && tmppages > 0) {
2515 migration_rate_limit();
2516 }
2517 }
2518 if (preempt_active) {
2519 qemu_mutex_lock(&rs->bitmap_mutex);
2520 }
2521 } else {
2522 tmppages = 0;
23feba90 2523 }
f3321554
PX
2524
2525 if (tmppages < 0) {
d9e474ea 2526 pss_host_page_finish(pss);
f3321554
PX
2527 return tmppages;
2528 }
2529
d9e474ea
PX
2530 pss_find_next_dirty(pss);
2531 } while (pss_within_range(pss));
2532
2533 pss_host_page_finish(pss);
278e2f55
AG
2534
2535 res = ram_save_release_protection(rs, pss, start_page);
2536 return (res < 0 ? res : pages);
a82d593b 2537}
6c595cde 2538
56e93d26 2539/**
3d0684b2 2540 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2541 *
2542 * Called within an RCU critical section.
2543 *
e8f3735f
XG
2544 * Returns the number of pages written where zero means no dirty pages,
2545 * or negative on error
56e93d26 2546 *
6f37bb8b 2547 * @rs: current RAM state
a82d593b
DDAG
2548 *
2549 * On systems where host-page-size > target-page-size it will send all the
2550 * pages in a host page that are dirty.
56e93d26 2551 */
05931ec5 2552static int ram_find_and_save_block(RAMState *rs)
56e93d26 2553{
f1668764 2554 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2555 int pages = 0;
56e93d26 2556
0827b9e9 2557 /* No dirty page as there is zero RAM */
8d80e195 2558 if (!rs->ram_bytes_total) {
0827b9e9
AA
2559 return pages;
2560 }
2561
4934a5dd
PX
2562 /*
2563 * Always keep last_seen_block/last_page valid during this procedure,
2564 * because find_dirty_block() relies on these values (e.g., we compare
2565 * last_seen_block with pss.block to see whether we searched all the
2566 * ramblocks) to detect the completion of migration. Having NULL value
2567 * of last_seen_block can conditionally cause below loop to run forever.
2568 */
2569 if (!rs->last_seen_block) {
2570 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2571 rs->last_page = 0;
2572 }
2573
f1668764 2574 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2575
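    /*
     * Main scan loop: prefer queued (postcopy/urgent) pages, otherwise
     * look for the next dirty page; stop once a host page has been sent
     * or every block turned out to be clean.
     */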
31e2ac74 2576 while (true) {
51efd36f 2577 if (!get_queued_page(rs, pss)) {
b062106d 2578 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2579 int res = find_dirty_block(rs, pss);
2580 if (res != PAGE_DIRTY_FOUND) {
2581 if (res == PAGE_ALL_CLEAN) {
51efd36f 2582 break;
31e2ac74
JQ
2583 } else if (res == PAGE_TRY_AGAIN) {
2584 continue;
51efd36f
JQ
2585 }
2586 }
56e93d26 2587 }
51efd36f 2588 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2589 if (pages) {
2590 break;
2591 }
2592 }
56e93d26 2593
f1668764
PX
2594 rs->last_seen_block = pss->block;
2595 rs->last_page = pss->page;
56e93d26
JQ
2596
2597 return pages;
2598}
2599
2600void acct_update_position(QEMUFile *f, size_t size, bool zero)
2601{
2602 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2603
56e93d26 2604 if (zero) {
23b7576d 2605 stat64_add(&ram_atomic_counters.duplicate, pages);
56e93d26 2606 } else {
23b7576d 2607 stat64_add(&ram_atomic_counters.normal, pages);
4c2d0f6d 2608 ram_transferred_add(size);
1a93bd2f 2609 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2610 }
2611}
2612
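/*
 * Total size of all migratable RAM blocks, including blocks flagged as
 * ignored; ram_bytes_total() below counts only the blocks that are not
 * ignored.
 */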
8008a272 2613static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2614{
2615 RAMBlock *block;
2616 uint64_t total = 0;
2617
89ac5a1d
DDAG
2618 RCU_READ_LOCK_GUARD();
2619
8008a272
JQ
2620 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2621 total += block->used_length;
99e15582 2622 }
56e93d26
JQ
2623 return total;
2624}
2625
fbd162e6
YK
2626uint64_t ram_bytes_total(void)
2627{
8008a272
JQ
2628 RAMBlock *block;
2629 uint64_t total = 0;
2630
2631 RCU_READ_LOCK_GUARD();
2632
2633 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2634 total += block->used_length;
2635 }
2636 return total;
fbd162e6
YK
2637}
2638
f265e0e4 2639static void xbzrle_load_setup(void)
56e93d26 2640{
f265e0e4 2641 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2642}
2643
f265e0e4
JQ
2644static void xbzrle_load_cleanup(void)
2645{
2646 g_free(XBZRLE.decoded_buf);
2647 XBZRLE.decoded_buf = NULL;
2648}
2649
7d7c96be
PX
2650static void ram_state_cleanup(RAMState **rsp)
2651{
b9ccaf6d
DDAG
2652 if (*rsp) {
2653 migration_page_queue_free(*rsp);
2654 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2655 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2656 g_free(*rsp);
2657 *rsp = NULL;
2658 }
7d7c96be
PX
2659}
2660
84593a08
PX
2661static void xbzrle_cleanup(void)
2662{
2663 XBZRLE_cache_lock();
2664 if (XBZRLE.cache) {
2665 cache_fini(XBZRLE.cache);
2666 g_free(XBZRLE.encoded_buf);
2667 g_free(XBZRLE.current_buf);
2668 g_free(XBZRLE.zero_target_page);
2669 XBZRLE.cache = NULL;
2670 XBZRLE.encoded_buf = NULL;
2671 XBZRLE.current_buf = NULL;
2672 XBZRLE.zero_target_page = NULL;
2673 }
2674 XBZRLE_cache_unlock();
2675}
2676
f265e0e4 2677static void ram_save_cleanup(void *opaque)
56e93d26 2678{
53518d94 2679 RAMState **rsp = opaque;
6b6712ef 2680 RAMBlock *block;
eb859c53 2681
278e2f55
AG
2682 /* We don't use dirty log with background snapshots */
2683 if (!migrate_background_snapshot()) {
2684 /* The caller must hold the iothread lock or be in a bh, so there is
2685 * no write race against the migration bitmap
2686 */
63b41db4
HH
2687 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2688 /*
2689 * do not stop dirty logging without having started it, since
2690 * memory_global_dirty_log_stop will assert that
2691 * memory_global_dirty_log_start/stop are used in pairs
2692 */
2693 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2694 }
278e2f55 2695 }
6b6712ef 2696
fbd162e6 2697 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2698 g_free(block->clear_bmap);
2699 block->clear_bmap = NULL;
6b6712ef
JQ
2700 g_free(block->bmap);
2701 block->bmap = NULL;
56e93d26
JQ
2702 }
2703
84593a08 2704 xbzrle_cleanup();
f0afa331 2705 compress_threads_save_cleanup();
7d7c96be 2706 ram_state_cleanup(rsp);
4010ba38
JQ
2707 g_free(migration_ops);
2708 migration_ops = NULL;
56e93d26
JQ
2709}
2710
6f37bb8b 2711static void ram_state_reset(RAMState *rs)
56e93d26 2712{
ec6f3ab9
PX
2713 int i;
2714
2715 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2716 rs->pss[i].last_sent_block = NULL;
2717 }
2718
6f37bb8b 2719 rs->last_seen_block = NULL;
269ace29 2720 rs->last_page = 0;
6f37bb8b 2721 rs->last_version = ram_list.version;
1a373522 2722 rs->xbzrle_enabled = false;
56e93d26
JQ
2723}
2724
2725#define MAX_WAIT 50 /* ms, half buffered_file limit */
2726
e0b266f0
DDAG
2727/* **** functions for postcopy ***** */
2728
ced1c616
PB
2729void ram_postcopy_migrated_memory_release(MigrationState *ms)
2730{
2731 struct RAMBlock *block;
ced1c616 2732
fbd162e6 2733 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2734 unsigned long *bitmap = block->bmap;
2735 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2736 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2737
2738 while (run_start < range) {
2739 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2740 ram_discard_range(block->idstr,
2741 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2742 ((ram_addr_t)(run_end - run_start))
2743 << TARGET_PAGE_BITS);
ced1c616
PB
2744 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2745 }
2746 }
2747}
2748
3d0684b2
JQ
2749/**
2750 * postcopy_send_discard_bm_ram: discard a RAMBlock
2751 *
e0b266f0 2752 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2753 *
2754 * @ms: current migration state
89dab31b 2755 * @block: RAMBlock to discard
e0b266f0 2756 */
9e7d1223 2757static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2758{
6b6712ef 2759 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2760 unsigned long current;
1e7cf8c3 2761 unsigned long *bitmap = block->bmap;
e0b266f0 2762
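    /*
     * Walk the dirty bitmap and emit one discard request per contiguous
     * run of dirty pages, as a (start, length) pair of page indexes.
     */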
6b6712ef 2763 for (current = 0; current < end; ) {
1e7cf8c3 2764 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2765 unsigned long zero, discard_length;
e0b266f0 2766
33a5cb62
WY
2767 if (one >= end) {
2768 break;
2769 }
e0b266f0 2770
1e7cf8c3 2771 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2772
2773 if (zero >= end) {
2774 discard_length = end - one;
e0b266f0 2775 } else {
33a5cb62
WY
2776 discard_length = zero - one;
2777 }
810cf2bb 2778 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2779 current = one + discard_length;
e0b266f0 2780 }
e0b266f0
DDAG
2781}
2782
f30c2e5b
PX
2783static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2784
3d0684b2
JQ
2785/**
2786 * postcopy_each_ram_send_discard: discard all RAMBlocks
2787 *
e0b266f0
DDAG
2788 * Utility for the outgoing postcopy code.
2789 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2790 * passing it bitmap indexes and name.
e0b266f0
DDAG
2791 * (qemu_ram_foreach_block ends up passing unscaled lengths
2792 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2793 *
2794 * @ms: current migration state
e0b266f0 2795 */
739fcc1b 2796static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2797{
2798 struct RAMBlock *block;
e0b266f0 2799
fbd162e6 2800 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2801 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2802
f30c2e5b
PX
2803 /*
2804 * Deal with TPS != HPS and huge pages. It discards any partially sent
2805 * host-page size chunks and marks any partially dirty host-page size
2806 * chunks as all dirty. In this case the host-page is the host-page
2807 * for the particular RAMBlock, i.e. it might be a huge page.
2808 */
2809 postcopy_chunk_hostpages_pass(ms, block);
2810
e0b266f0
DDAG
2811 /*
2812 * Postcopy sends chunks of bitmap over the wire, but it
2813 * just needs indexes at this point, avoids it having
2814 * target page specific code.
2815 */
739fcc1b 2816 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2817 postcopy_discard_send_finish(ms);
e0b266f0 2818 }
e0b266f0
DDAG
2819}
2820
3d0684b2 2821/**
8324ef86 2822 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2823 *
2824 * Helper for postcopy_chunk_hostpages; it's called twice to
2825 * canonicalize the two bitmaps, that are similar, but one is
2826 * inverted.
99e314eb 2827 *
3d0684b2
JQ
2828 * Postcopy requires that all target pages in a hostpage are dirty or
2829 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2830 *
3d0684b2 2831 * @ms: current migration state
3d0684b2 2832 * @block: block that contains the page we want to canonicalize
99e314eb 2833 */
1e7cf8c3 2834static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2835{
53518d94 2836 RAMState *rs = ram_state;
6b6712ef 2837 unsigned long *bitmap = block->bmap;
29c59172 2838 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2839 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2840 unsigned long run_start;
2841
29c59172
DDAG
2842 if (block->page_size == TARGET_PAGE_SIZE) {
2843 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2844 return;
2845 }
2846
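    /*
     * Worked example: with 2MiB hugepages and 4KiB target pages,
     * host_ratio is 512, so a dirty run that starts or ends mid-hostpage
     * gets its whole 512-page host chunk re-marked dirty below, aligned
     * via fixup_start_addr.
     */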
1e7cf8c3
WY
2847 /* Find a dirty page */
2848 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2849
6b6712ef 2850 while (run_start < pages) {
99e314eb
DDAG
2851
2852 /*
2853 * If the start of this run of pages is in the middle of a host
2854 * page, then we need to fixup this host page.
2855 */
9dec3cc3 2856 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2857 /* Find the end of this run */
1e7cf8c3 2858 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2859 /*
2860 * If the end isn't at the start of a host page, then the
2861 * run doesn't finish at the end of a host page
2862 * and we need to discard.
2863 */
99e314eb
DDAG
2864 }
2865
9dec3cc3 2866 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2867 unsigned long page;
dad45ab2
WY
2868 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2869 host_ratio);
2870 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2871
99e314eb
DDAG
2872 /* Clean up the bitmap */
2873 for (page = fixup_start_addr;
2874 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2875 /*
2876 * Remark them as dirty, updating the count for any pages
2877 * that weren't previously dirty.
2878 */
0d8ec885 2879 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2880 }
2881 }
2882
1e7cf8c3
WY
2883 /* Find the next dirty page for the next iteration */
2884 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2885 }
2886}
2887
3d0684b2
JQ
2888/**
2889 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2890 *
e0b266f0
DDAG
2891 * Transmit the set of pages to be discarded after precopy to the target;
2892 * these are pages that:
2893 * a) Have been previously transmitted but are now dirty again
2894 * b) Pages that have never been transmitted, this ensures that
2895 * any pages on the destination that have been mapped by background
2896 * tasks get discarded (transparent huge pages is the specific concern)
2897 * Hopefully this is pretty sparse
3d0684b2
JQ
2898 *
2899 * @ms: current migration state
e0b266f0 2900 */
739fcc1b 2901void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2902{
53518d94 2903 RAMState *rs = ram_state;
e0b266f0 2904
89ac5a1d 2905 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2906
2907 /* This should be our last sync, the src is now paused */
eb859c53 2908 migration_bitmap_sync(rs);
e0b266f0 2909
6b6712ef 2910 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2911 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2912 rs->last_seen_block = NULL;
6b6712ef 2913 rs->last_page = 0;
e0b266f0 2914
739fcc1b 2915 postcopy_each_ram_send_discard(ms);
e0b266f0 2916
739fcc1b 2917 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2918}
2919
3d0684b2
JQ
2920/**
2921 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2922 *
3d0684b2 2923 * Returns zero on success
e0b266f0 2924 *
36449157
JQ
2925 * @rbname: name of the RAMBlock of the request. NULL means the
2926 * same that last one.
3d0684b2
JQ
2927 * @start: RAMBlock starting page
2928 * @length: RAMBlock size
e0b266f0 2929 */
aaa2064c 2930int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2931{
36449157 2932 trace_ram_discard_range(rbname, start, length);
d3a5038c 2933
89ac5a1d 2934 RCU_READ_LOCK_GUARD();
36449157 2935 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2936
2937 if (!rb) {
36449157 2938 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2939 return -1;
e0b266f0
DDAG
2940 }
2941
814bb08f
PX
2942 /*
2943 * On source VM, we don't need to update the received bitmap since
2944 * we don't even have one.
2945 */
2946 if (rb->receivedmap) {
2947 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2948 length >> qemu_target_page_bits());
2949 }
2950
03acb4e9 2951 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2952}
2953
84593a08
PX
2954/*
2955 * For every allocation, we try not to crash the VM if the
2956 * allocation fails.
2957 */
2958static int xbzrle_init(void)
2959{
2960 Error *local_err = NULL;
2961
2962 if (!migrate_use_xbzrle()) {
2963 return 0;
2964 }
2965
2966 XBZRLE_cache_lock();
2967
2968 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2969 if (!XBZRLE.zero_target_page) {
2970 error_report("%s: Error allocating zero page", __func__);
2971 goto err_out;
2972 }
2973
2974 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2975 TARGET_PAGE_SIZE, &local_err);
2976 if (!XBZRLE.cache) {
2977 error_report_err(local_err);
2978 goto free_zero_page;
2979 }
2980
2981 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2982 if (!XBZRLE.encoded_buf) {
2983 error_report("%s: Error allocating encoded_buf", __func__);
2984 goto free_cache;
2985 }
2986
2987 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2988 if (!XBZRLE.current_buf) {
2989 error_report("%s: Error allocating current_buf", __func__);
2990 goto free_encoded_buf;
2991 }
2992
2993 /* We are all good */
2994 XBZRLE_cache_unlock();
2995 return 0;
2996
2997free_encoded_buf:
2998 g_free(XBZRLE.encoded_buf);
2999 XBZRLE.encoded_buf = NULL;
3000free_cache:
3001 cache_fini(XBZRLE.cache);
3002 XBZRLE.cache = NULL;
3003free_zero_page:
3004 g_free(XBZRLE.zero_target_page);
3005 XBZRLE.zero_target_page = NULL;
3006err_out:
3007 XBZRLE_cache_unlock();
3008 return -ENOMEM;
3009}
3010
53518d94 3011static int ram_state_init(RAMState **rsp)
56e93d26 3012{
7d00ee6a
PX
3013 *rsp = g_try_new0(RAMState, 1);
3014
3015 if (!*rsp) {
3016 error_report("%s: Init ramstate fail", __func__);
3017 return -1;
3018 }
53518d94
JQ
3019
3020 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3021 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3022 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 3023 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 3024
7d00ee6a 3025 /*
40c4d4a8
IR
3026 * Count the total number of pages used by ram blocks not including any
3027 * gaps due to alignment or unplugs.
03158519 3028 * This must match the initial values of the dirty bitmap.
7d00ee6a 3029 */
8d80e195 3030 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
3031 ram_state_reset(*rsp);
3032
3033 return 0;
3034}
3035
d6eff5d7 3036static void ram_list_init_bitmaps(void)
7d00ee6a 3037{
002cad6b 3038 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3039 RAMBlock *block;
3040 unsigned long pages;
002cad6b 3041 uint8_t shift;
56e93d26 3042
0827b9e9
AA
3043 /* Skip setting bitmap if there is no RAM */
3044 if (ram_bytes_total()) {
002cad6b
PX
3045 shift = ms->clear_bitmap_shift;
3046 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3047 error_report("clear_bitmap_shift (%u) too big, using "
3048 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3049 shift = CLEAR_BITMAP_SHIFT_MAX;
3050 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3051 error_report("clear_bitmap_shift (%u) too small, using "
3052 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3053 shift = CLEAR_BITMAP_SHIFT_MIN;
3054 }
3055
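        /*
         * One clear_bmap bit covers (1 << shift) target pages; with the
         * usual default shift of 18 and 4KiB target pages that is 1GiB
         * of guest RAM per bit (assuming the CLEAR_BITMAP_SHIFT defaults).
         */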
fbd162e6 3056 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3057 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3058 /*
3059 * The initial dirty bitmap for migration must be set with all
3060 * ones to make sure we'll migrate every guest RAM page to
3061 * the destination.
40c4d4a8
IR
3062 * Here we set RAMBlock.bmap all to 1 because when we restart a
3063 * migration after a failed one, ram_list.
3064 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3065 * guest memory.
03158519 3066 */
6b6712ef 3067 block->bmap = bitmap_new(pages);
40c4d4a8 3068 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3069 block->clear_bmap_shift = shift;
3070 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3071 }
f3f491fc 3072 }
d6eff5d7
PX
3073}
3074
be39b4cd
DH
3075static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3076{
3077 unsigned long pages;
3078 RAMBlock *rb;
3079
3080 RCU_READ_LOCK_GUARD();
3081
3082 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3083 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3084 rs->migration_dirty_pages -= pages;
3085 }
3086}
3087
d6eff5d7
PX
3088static void ram_init_bitmaps(RAMState *rs)
3089{
3090 /* For memory_global_dirty_log_start below. */
3091 qemu_mutex_lock_iothread();
3092 qemu_mutex_lock_ramlist();
f3f491fc 3093
89ac5a1d
DDAG
3094 WITH_RCU_READ_LOCK_GUARD() {
3095 ram_list_init_bitmaps();
278e2f55
AG
3096 /* We don't use dirty log with background snapshots */
3097 if (!migrate_background_snapshot()) {
63b41db4 3098 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3099 migration_bitmap_sync_precopy(rs);
3100 }
89ac5a1d 3101 }
56e93d26 3102 qemu_mutex_unlock_ramlist();
49877834 3103 qemu_mutex_unlock_iothread();
be39b4cd
DH
3104
3105 /*
3106 * After an eventual first bitmap sync, fixup the initial bitmap
3107 * containing all 1s to exclude any discarded pages from migration.
3108 */
3109 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3110}
3111
3112static int ram_init_all(RAMState **rsp)
3113{
3114 if (ram_state_init(rsp)) {
3115 return -1;
3116 }
3117
3118 if (xbzrle_init()) {
3119 ram_state_cleanup(rsp);
3120 return -1;
3121 }
3122
3123 ram_init_bitmaps(*rsp);
a91246c9
HZ
3124
3125 return 0;
3126}
3127
08614f34
PX
3128static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3129{
3130 RAMBlock *block;
3131 uint64_t pages = 0;
3132
3133 /*
3134 * Postcopy is not using xbzrle/compression, so no need for that.
3135 * Also, since the source is already halted, we don't need to care
3136 * about dirty page logging either.
3137 */
3138
fbd162e6 3139 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3140 pages += bitmap_count_one(block->bmap,
3141 block->used_length >> TARGET_PAGE_BITS);
3142 }
3143
3144 /* This may not be aligned with current bitmaps. Recalculate. */
3145 rs->migration_dirty_pages = pages;
3146
1a373522 3147 ram_state_reset(rs);
08614f34
PX
3148
3149 /* Update RAMState cache of output QEMUFile */
7f401b80 3150 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3151
3152 trace_ram_state_resume_prepare(pages);
3153}
3154
6bcb05fc
WW
3155/*
3156 * This function clears bits of the free pages reported by the caller from the
3157 * migration dirty bitmap. @addr is the host address corresponding to the
3158 * start of the contiguous guest free pages, and @len is the total bytes of
3159 * those pages.
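 *
 * Example (hypothetical, for illustration only): a balloon free-page hint
 * covering 2MiB of guest memory starting at host address addr would reach
 * this function as
 *     qemu_guest_free_page_hint(addr, 2 * 1024 * 1024);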
3160 */
3161void qemu_guest_free_page_hint(void *addr, size_t len)
3162{
3163 RAMBlock *block;
3164 ram_addr_t offset;
3165 size_t used_len, start, npages;
3166 MigrationState *s = migrate_get_current();
3167
3168 /* This function is currently expected to be used during live migration */
3169 if (!migration_is_setup_or_active(s->state)) {
3170 return;
3171 }
3172
3173 for (; len > 0; len -= used_len, addr += used_len) {
3174 block = qemu_ram_block_from_host(addr, false, &offset);
3175 if (unlikely(!block || offset >= block->used_length)) {
3176 /*
3177 * The implementation might not support RAMBlock resize during
3178 * live migration, but it could happen in theory with future
3179 * updates. So we add a check here to capture that case.
3180 */
3181 error_report_once("%s unexpected error", __func__);
3182 return;
3183 }
3184
3185 if (len <= block->used_length - offset) {
3186 used_len = len;
3187 } else {
3188 used_len = block->used_length - offset;
3189 }
3190
3191 start = offset >> TARGET_PAGE_BITS;
3192 npages = used_len >> TARGET_PAGE_BITS;
3193
3194 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3195 /*
3196 * The skipped free pages are equivalent to having been sent from clear_bmap's
3197 * perspective, so clear the bits from the memory region bitmap which
3198 * are initially set. Otherwise those skipped pages will be sent in
3199 * the next round after syncing from the memory region bitmap.
3200 */
1230a25f 3201 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3202 ram_state->migration_dirty_pages -=
3203 bitmap_count_one_with_offset(block->bmap, start, npages);
3204 bitmap_clear(block->bmap, start, npages);
3205 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3206 }
3207}
3208
3d0684b2
JQ
3209/*
3210 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3211 * a long-running RCU critical section. When rcu-reclaims in the code
3212 * start to become numerous it will be necessary to reduce the
3213 * granularity of these critical sections.
3214 */
3215
3d0684b2
JQ
3216/**
3217 * ram_save_setup: Setup RAM for migration
3218 *
3219 * Returns zero to indicate success and negative for error
3220 *
3221 * @f: QEMUFile where to send the data
3222 * @opaque: RAMState pointer
3223 */
a91246c9
HZ
3224static int ram_save_setup(QEMUFile *f, void *opaque)
3225{
53518d94 3226 RAMState **rsp = opaque;
a91246c9 3227 RAMBlock *block;
33d70973 3228 int ret;
a91246c9 3229
dcaf446e
XG
3230 if (compress_threads_save_setup()) {
3231 return -1;
3232 }
3233
a91246c9
HZ
3234 /* migration has already setup the bitmap, reuse it. */
3235 if (!migration_in_colo_state()) {
7d00ee6a 3236 if (ram_init_all(rsp) != 0) {
dcaf446e 3237 compress_threads_save_cleanup();
a91246c9 3238 return -1;
53518d94 3239 }
a91246c9 3240 }
7f401b80 3241 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3242
0e6ebd48 3243 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3244 qemu_put_be64(f, ram_bytes_total_with_ignored()
3245 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3246
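        /*
         * Setup section layout: total RAM size tagged with
         * RAM_SAVE_FLAG_MEM_SIZE, followed by each migratable block's
         * idstr and used_length (plus page size and block address when
         * postcopy / ignore-shared are in use).
         */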
0e6ebd48
DDAG
3247 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3248 qemu_put_byte(f, strlen(block->idstr));
3249 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3250 qemu_put_be64(f, block->used_length);
3251 if (migrate_postcopy_ram() && block->page_size !=
3252 qemu_host_page_size) {
3253 qemu_put_be64(f, block->page_size);
3254 }
3255 if (migrate_ignore_shared()) {
3256 qemu_put_be64(f, block->mr->addr);
3257 }
fbd162e6 3258 }
56e93d26
JQ
3259 }
3260
56e93d26
JQ
3261 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3262 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3263
4010ba38
JQ
3264 migration_ops = g_malloc0(sizeof(MigrationOps));
3265 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
33d70973
LB
3266 ret = multifd_send_sync_main(f);
3267 if (ret < 0) {
3268 return ret;
3269 }
3270
56e93d26 3271 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3272 qemu_fflush(f);
56e93d26
JQ
3273
3274 return 0;
3275}
3276
3d0684b2
JQ
3277/**
3278 * ram_save_iterate: iterative stage for migration
3279 *
3280 * Returns zero to indicate success and negative for error
3281 *
3282 * @f: QEMUFile where to send the data
3283 * @opaque: RAMState pointer
3284 */
56e93d26
JQ
3285static int ram_save_iterate(QEMUFile *f, void *opaque)
3286{
53518d94
JQ
3287 RAMState **temp = opaque;
3288 RAMState *rs = *temp;
3d4095b2 3289 int ret = 0;
56e93d26
JQ
3290 int i;
3291 int64_t t0;
5c90308f 3292 int done = 0;
56e93d26 3293
b2557345
PL
3294 if (blk_mig_bulk_active()) {
3295 /* Avoid transferring ram during bulk phase of block migration as
3296 * the bulk phase will usually take a long time and transferring
3297 * ram updates during that time is pointless. */
3298 goto out;
3299 }
3300
63268c49
PX
3301 /*
3302 * We'll hold this lock for a little while, but that's okay for two reasons.
3303 * Firstly, the only other thread that can take it is the one that calls
3304 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3305 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3306 * guarantees that we'll at least release it on a regular basis.
3307 */
3308 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3309 WITH_RCU_READ_LOCK_GUARD() {
3310 if (ram_list.version != rs->last_version) {
3311 ram_state_reset(rs);
3312 }
56e93d26 3313
89ac5a1d
DDAG
3314 /* Read version before ram_list.blocks */
3315 smp_rmb();
56e93d26 3316
89ac5a1d 3317 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3318
89ac5a1d
DDAG
3319 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3320 i = 0;
3321 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3322 postcopy_has_request(rs)) {
89ac5a1d 3323 int pages;
e03a34f8 3324
89ac5a1d
DDAG
3325 if (qemu_file_get_error(f)) {
3326 break;
3327 }
e8f3735f 3328
05931ec5 3329 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3330 /* no more pages to send */
3331 if (pages == 0) {
3332 done = 1;
3333 break;
3334 }
e8f3735f 3335
89ac5a1d
DDAG
3336 if (pages < 0) {
3337 qemu_file_set_error(f, pages);
56e93d26
JQ
3338 break;
3339 }
89ac5a1d
DDAG
3340
3341 rs->target_page_count += pages;
3342
644acf99
WY
3343 /*
3344 * During postcopy, it is necessary to make sure one whole host
3345 * page is sent in one chunk.
3346 */
3347 if (migrate_postcopy_ram()) {
3348 flush_compressed_data(rs);
3349 }
3350
89ac5a1d
DDAG
3351 /*
3352 * We want to check in the 1st loop, just in case it was the 1st
3353 * time and we had to sync the dirty bitmap.
3354 * qemu_clock_get_ns() is a bit expensive, so we only check it
3355 * every few iterations
3356 */
3357 if ((i & 63) == 0) {
3358 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3359 1000000;
3360 if (t1 > MAX_WAIT) {
3361 trace_ram_save_iterate_big_wait(t1, i);
3362 break;
3363 }
3364 }
3365 i++;
56e93d26 3366 }
56e93d26 3367 }
63268c49 3368 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3369
3370 /*
3371 * Must occur before EOS (or any QEMUFile operation)
3372 * because of RDMA protocol.
3373 */
3374 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3375
b2557345 3376out:
b69a0227
JQ
3377 if (ret >= 0
3378 && migration_is_setup_or_active(migrate_get_current()->state)) {
7f401b80 3379 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3380 if (ret < 0) {
3381 return ret;
3382 }
3383
3d4095b2
JQ
3384 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3385 qemu_fflush(f);
4c2d0f6d 3386 ram_transferred_add(8);
56e93d26 3387
3d4095b2
JQ
3388 ret = qemu_file_get_error(f);
3389 }
56e93d26
JQ
3390 if (ret < 0) {
3391 return ret;
3392 }
3393
5c90308f 3394 return done;
56e93d26
JQ
3395}
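/*
 * A minimal sketch of the bounded-latency loop pattern used above: keep
 * sending work items, but only sample the (comparatively expensive) clock
 * every 64th iteration and bail out once a time budget is exceeded.
 * MAX_WAIT_MS, now_ms() and bounded_work_loop() are hypothetical names,
 * not QEMU APIs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define MAX_WAIT_MS 50

static uint64_t now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static void bounded_work_loop(bool (*send_one)(void *), void *opaque)
{
    uint64_t t0 = now_ms();

    for (int i = 0; send_one(opaque); i++) {
        if ((i & 63) == 0 && now_ms() - t0 > MAX_WAIT_MS) {
            break;   /* give the caller a chance to run again soon */
        }
    }
}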
3396
3d0684b2
JQ
3397/**
3398 * ram_save_complete: function called to send the remaining amount of ram
3399 *
e8f3735f 3400 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3401 *
3402 * Called with iothread lock
3403 *
3404 * @f: QEMUFile where to send the data
3405 * @opaque: RAMState pointer
3406 */
56e93d26
JQ
3407static int ram_save_complete(QEMUFile *f, void *opaque)
3408{
53518d94
JQ
3409 RAMState **temp = opaque;
3410 RAMState *rs = *temp;
e8f3735f 3411 int ret = 0;
6f37bb8b 3412
05931ec5
JQ
3413 rs->last_stage = !migration_in_colo_state();
3414
89ac5a1d
DDAG
3415 WITH_RCU_READ_LOCK_GUARD() {
3416 if (!migration_in_postcopy()) {
3417 migration_bitmap_sync_precopy(rs);
3418 }
56e93d26 3419
89ac5a1d 3420 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3421
89ac5a1d 3422 /* try transferring iterative blocks of memory */
56e93d26 3423
89ac5a1d 3424 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3425 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3426 while (true) {
3427 int pages;
56e93d26 3428
05931ec5 3429 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3430 /* no more blocks to send */
3431 if (pages == 0) {
3432 break;
3433 }
3434 if (pages < 0) {
3435 ret = pages;
3436 break;
3437 }
e8f3735f 3438 }
c13221b5 3439 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3440
89ac5a1d
DDAG
3441 flush_compressed_data(rs);
3442 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3443 }
d09a6fde 3444
33d70973
LB
3445 if (ret < 0) {
3446 return ret;
3d4095b2 3447 }
56e93d26 3448
7f401b80 3449 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3450 if (ret < 0) {
3451 return ret;
3452 }
3453
3454 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3455 qemu_fflush(f);
3456
3457 return 0;
56e93d26
JQ
3458}
3459
fd70385d 3460static void ram_state_pending_estimate(void *opaque,
c8df4a7a
JQ
3461 uint64_t *res_precopy_only,
3462 uint64_t *res_compatible,
3463 uint64_t *res_postcopy_only)
56e93d26 3464{
53518d94
JQ
3465 RAMState **temp = opaque;
3466 RAMState *rs = *temp;
56e93d26 3467
c8df4a7a 3468 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3469
c8df4a7a
JQ
3470 if (migrate_postcopy_ram()) {
3471 /* We can do postcopy, and all the data is postcopiable */
3472 *res_postcopy_only += remaining_size;
3473 } else {
3474 *res_precopy_only += remaining_size;
3475 }
3476}
3477
fd70385d 3478static void ram_state_pending_exact(void *opaque,
c8df4a7a
JQ
3479 uint64_t *res_precopy_only,
3480 uint64_t *res_compatible,
3481 uint64_t *res_postcopy_only)
3482{
3483 RAMState **temp = opaque;
3484 RAMState *rs = *temp;
3485
3486 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3487
3488 if (!migration_in_postcopy()) {
56e93d26 3489 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3490 WITH_RCU_READ_LOCK_GUARD() {
3491 migration_bitmap_sync_precopy(rs);
3492 }
56e93d26 3493 qemu_mutex_unlock_iothread();
9edabd4d 3494 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3495 }
c31b098f 3496
86e1167e
VSO
3497 if (migrate_postcopy_ram()) {
3498 /* We can do postcopy, and all the data is postcopiable */
47995026 3499 *res_compatible += remaining_size;
86e1167e 3500 } else {
47995026 3501 *res_precopy_only += remaining_size;
86e1167e 3502 }
56e93d26
JQ
3503}
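/*
 * A worked example of the estimate used by both functions above, with
 * made-up numbers: remaining bytes are simply the dirty-page count
 * multiplied by the target page size.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t target_page_size = 4096;     /* 4 KiB target pages */
    const uint64_t dirty_pages = 262144;        /* example dirty-page count */

    /* 262144 * 4096 = 1073741824 bytes, i.e. 1 GiB still to transfer. */
    printf("remaining = %llu bytes\n",
           (unsigned long long)(dirty_pages * target_page_size));
    return 0;
}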
3504
3505static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3506{
3507 unsigned int xh_len;
3508 int xh_flags;
063e760a 3509 uint8_t *loaded_data;
56e93d26 3510
56e93d26
JQ
3511 /* extract RLE header */
3512 xh_flags = qemu_get_byte(f);
3513 xh_len = qemu_get_be16(f);
3514
3515 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3516 error_report("Failed to load XBZRLE page - wrong compression!");
3517 return -1;
3518 }
3519
3520 if (xh_len > TARGET_PAGE_SIZE) {
3521 error_report("Failed to load XBZRLE page - len overflow!");
3522 return -1;
3523 }
f265e0e4 3524 loaded_data = XBZRLE.decoded_buf;
56e93d26 3525 /* load data and decode */
f265e0e4 3526 /* it can change loaded_data to point to an internal buffer */
063e760a 3527 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3528
3529 /* decode RLE */
063e760a 3530 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3531 TARGET_PAGE_SIZE) == -1) {
3532 error_report("Failed to load XBZRLE page - decode error!");
3533 return -1;
3534 }
3535
3536 return 0;
3537}
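/*
 * A minimal sketch of the XBZRLE record header parsed above: a one-byte
 * flags field followed by a big-endian 16-bit payload length, capped at
 * one target page.  The flag value and page size are assumptions for the
 * sketch, and parse_xbzrle_header() is a hypothetical name.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_ENCODING_FLAG_XBZRLE 0x1
#define SKETCH_PAGE_SIZE 4096

static int parse_xbzrle_header(const uint8_t *buf, size_t avail)
{
    if (avail < 3 || buf[0] != SKETCH_ENCODING_FLAG_XBZRLE) {
        return -1;                                   /* wrong encoding */
    }
    unsigned len = ((unsigned)buf[1] << 8) | buf[2]; /* big-endian length */
    if (len > SKETCH_PAGE_SIZE || 3u + len > avail) {
        return -1;                                   /* length overflow */
    }
    return (int)len;                                 /* payload starts at buf + 3 */
}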
3538
3d0684b2
JQ
3539/**
3540 * ram_block_from_stream: read a RAMBlock id from the migration stream
3541 *
3542 * Must be called from within a rcu critical section.
3543 *
56e93d26 3544 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3545 *
755e8d7c 3546 * @mis: the migration incoming state pointer
3d0684b2
JQ
3547 * @f: QEMUFile where to read the data from
3548 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3549 * @channel: the channel we're using
a7180877 3550 */
755e8d7c 3551static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3552 QEMUFile *f, int flags,
3553 int channel)
56e93d26 3554{
c01b16ed 3555 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3556 char id[256];
3557 uint8_t len;
3558
3559 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3560 if (!block) {
56e93d26
JQ
3561 error_report("Ack, bad migration stream!");
3562 return NULL;
3563 }
4c4bad48 3564 return block;
56e93d26
JQ
3565 }
3566
3567 len = qemu_get_byte(f);
3568 qemu_get_buffer(f, (uint8_t *)id, len);
3569 id[len] = 0;
3570
e3dd7493 3571 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3572 if (!block) {
3573 error_report("Can't find block %s", id);
3574 return NULL;
56e93d26
JQ
3575 }
3576
fbd162e6 3577 if (ramblock_is_ignored(block)) {
b895de50
CLG
3578 error_report("block %s should not be migrated !", id);
3579 return NULL;
3580 }
3581
c01b16ed 3582 mis->last_recv_block[channel] = block;
755e8d7c 3583
4c4bad48
HZ
3584 return block;
3585}
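/*
 * A minimal sketch of the block reference parsed above: either the
 * CONTINUE flag says "same block as the previous page", or a one-byte id
 * length is followed by that many id bytes.  The flag value is an
 * assumption for the sketch, and parse_block_ref() is a hypothetical name.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_FLAG_CONTINUE 0x20

/* Returns the number of bytes consumed from buf, or -1 on error.
 * 'id' must have room for 256 bytes plus a terminating NUL. */
static int parse_block_ref(const uint8_t *buf, size_t avail, int flags,
                           char *id, const char *last_id)
{
    if (flags & SKETCH_FLAG_CONTINUE) {
        if (!last_id) {
            return -1;              /* continuation without a previous block */
        }
        strcpy(id, last_id);
        return 0;                   /* nothing read from the stream */
    }
    if (avail < 1 || avail < 1u + buf[0]) {
        return -1;
    }
    memcpy(id, buf + 1, buf[0]);
    id[buf[0]] = '\0';
    return 1 + buf[0];
}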
3586
3587static inline void *host_from_ram_block_offset(RAMBlock *block,
3588 ram_addr_t offset)
3589{
3590 if (!offset_in_ramblock(block, offset)) {
3591 return NULL;
3592 }
3593
3594 return block->host + offset;
56e93d26
JQ
3595}
3596
6a23f639
DH
3597static void *host_page_from_ram_block_offset(RAMBlock *block,
3598 ram_addr_t offset)
3599{
3600 /* Note: Explicitly no check against offset_in_ramblock(). */
3601 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3602 block->page_size);
3603}
3604
3605static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3606 ram_addr_t offset)
3607{
3608 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3609}
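/*
 * A worked example of the alignment arithmetic in the two helpers above,
 * assuming a 2 MiB host (huge)page size; the addresses are made up.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint64_t host = 0x7f0000000000ULL;      /* 2 MiB-aligned mapping */
    const uint64_t page_size = 2 * 1024 * 1024;   /* block->page_size */
    const uint64_t offset = 0x234567;             /* offset within the block */

    /* Start of the host page containing this target page... */
    assert(((host + offset) & ~(page_size - 1)) == host + 0x200000);
    /* ...and the target page's offset inside that host page. */
    assert(((host + offset) & (page_size - 1)) == 0x34567);
    return 0;
}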
3610
13af18f2 3611static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3612 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3613{
3614 if (!offset_in_ramblock(block, offset)) {
3615 return NULL;
3616 }
3617 if (!block->colo_cache) {
3618 error_report("%s: colo_cache is NULL in block :%s",
3619 __func__, block->idstr);
3620 return NULL;
3621 }
7d9acafa
ZC
3622
3623 /*
3624 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3625 * It helps us decide which pages in the RAM cache should be flushed
3626 * into the VM's RAM later.
3627 */
8af66371
HZ
3628 if (record_bitmap &&
3629 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3630 ram_state->migration_dirty_pages++;
3631 }
13af18f2
ZC
3632 return block->colo_cache + offset;
3633}
3634
3d0684b2
JQ
3635/**
3636 * ram_handle_compressed: handle the zero page case
3637 *
56e93d26
JQ
3638 * If a page (or a whole RDMA chunk) has been
3639 * determined to be zero, then zap it.
3d0684b2
JQ
3640 *
3641 * @host: host address for the zero page
3642 * @ch: the byte the page is filled with. We only support zero
3643 * @size: size of the zero page
56e93d26
JQ
3644 */
3645void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3646{
bad452a7 3647 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3648 memset(host, ch, size);
3649 }
3650}
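/*
 * A minimal restatement of the zero-page handling above in plain C, to show
 * why the extra check matters: a page that is already zero is left
 * untouched, so it is not needlessly write-faulted or dirtied.
 * all_zero() and fill_page() are hypothetical names standing in for
 * buffer_is_zero() and the function above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool all_zero(const uint8_t *p, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

static void fill_page(uint8_t *host, uint8_t ch, size_t size)
{
    if (ch != 0 || !all_zero(host, size)) {
        memset(host, ch, size);   /* only write when the content must change */
    }
}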
3651
797ca154
XG
3652/* return the size after decompression, or negative value on error */
3653static int
3654qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3655 const uint8_t *source, size_t source_len)
3656{
3657 int err;
3658
3659 err = inflateReset(stream);
3660 if (err != Z_OK) {
3661 return -1;
3662 }
3663
3664 stream->avail_in = source_len;
3665 stream->next_in = (uint8_t *)source;
3666 stream->avail_out = dest_len;
3667 stream->next_out = dest;
3668
3669 err = inflate(stream, Z_NO_FLUSH);
3670 if (err != Z_STREAM_END) {
3671 return -1;
3672 }
3673
3674 return stream->total_out;
3675}
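/*
 * For reference, the matching one-shot zlib call on the sending side looks
 * roughly like this (the real sender uses per-thread z_streams and
 * deflate(); this simplified sketch uses compress2(), and
 * compress_page_sketch() is a hypothetical name).  The output buffer must
 * hold at least compressBound(page_size) bytes, which is also the size
 * allocated for compbuf in compress_threads_load_setup() below.
 */
#include <zlib.h>

static long compress_page_sketch(unsigned char *out, unsigned long out_len,
                                 const unsigned char *page,
                                 unsigned long page_size, int level)
{
    uLongf dest_len = out_len;

    if (compress2(out, &dest_len, page, page_size, level) != Z_OK) {
        return -1;
    }
    return (long)dest_len;   /* number of compressed bytes written */
}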
3676
56e93d26
JQ
3677static void *do_data_decompress(void *opaque)
3678{
3679 DecompressParam *param = opaque;
3680 unsigned long pagesize;
33d151f4 3681 uint8_t *des;
34ab9e97 3682 int len, ret;
56e93d26 3683
33d151f4 3684 qemu_mutex_lock(&param->mutex);
90e56fb4 3685 while (!param->quit) {
33d151f4
LL
3686 if (param->des) {
3687 des = param->des;
3688 len = param->len;
3689 param->des = 0;
3690 qemu_mutex_unlock(&param->mutex);
3691
56e93d26 3692 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3693
3694 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3695 param->compbuf, len);
f548222c 3696 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3697 error_report("decompress data failed");
3698 qemu_file_set_error(decomp_file, ret);
3699 }
73a8912b 3700
33d151f4
LL
3701 qemu_mutex_lock(&decomp_done_lock);
3702 param->done = true;
3703 qemu_cond_signal(&decomp_done_cond);
3704 qemu_mutex_unlock(&decomp_done_lock);
3705
3706 qemu_mutex_lock(&param->mutex);
3707 } else {
3708 qemu_cond_wait(&param->cond, &param->mutex);
3709 }
56e93d26 3710 }
33d151f4 3711 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3712
3713 return NULL;
3714}
3715
34ab9e97 3716static int wait_for_decompress_done(void)
5533b2e9
LL
3717{
3718 int idx, thread_count;
3719
3720 if (!migrate_use_compression()) {
34ab9e97 3721 return 0;
5533b2e9
LL
3722 }
3723
3724 thread_count = migrate_decompress_threads();
3725 qemu_mutex_lock(&decomp_done_lock);
3726 for (idx = 0; idx < thread_count; idx++) {
3727 while (!decomp_param[idx].done) {
3728 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3729 }
3730 }
3731 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3732 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3733}
3734
f0afa331 3735static void compress_threads_load_cleanup(void)
56e93d26
JQ
3736{
3737 int i, thread_count;
3738
3416ab5b
JQ
3739 if (!migrate_use_compression()) {
3740 return;
3741 }
56e93d26
JQ
3742 thread_count = migrate_decompress_threads();
3743 for (i = 0; i < thread_count; i++) {
797ca154
XG
3744 /*
3745 * we use it as an indicator of whether the thread has been
3746 * properly initialized or not
3747 */
3748 if (!decomp_param[i].compbuf) {
3749 break;
3750 }
3751
56e93d26 3752 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3753 decomp_param[i].quit = true;
56e93d26
JQ
3754 qemu_cond_signal(&decomp_param[i].cond);
3755 qemu_mutex_unlock(&decomp_param[i].mutex);
3756 }
3757 for (i = 0; i < thread_count; i++) {
797ca154
XG
3758 if (!decomp_param[i].compbuf) {
3759 break;
3760 }
3761
56e93d26
JQ
3762 qemu_thread_join(decompress_threads + i);
3763 qemu_mutex_destroy(&decomp_param[i].mutex);
3764 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3765 inflateEnd(&decomp_param[i].stream);
56e93d26 3766 g_free(decomp_param[i].compbuf);
797ca154 3767 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3768 }
3769 g_free(decompress_threads);
3770 g_free(decomp_param);
56e93d26
JQ
3771 decompress_threads = NULL;
3772 decomp_param = NULL;
34ab9e97 3773 decomp_file = NULL;
56e93d26
JQ
3774}
3775
34ab9e97 3776static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3777{
3778 int i, thread_count;
3779
3780 if (!migrate_use_compression()) {
3781 return 0;
3782 }
3783
3784 thread_count = migrate_decompress_threads();
3785 decompress_threads = g_new0(QemuThread, thread_count);
3786 decomp_param = g_new0(DecompressParam, thread_count);
3787 qemu_mutex_init(&decomp_done_lock);
3788 qemu_cond_init(&decomp_done_cond);
34ab9e97 3789 decomp_file = f;
797ca154
XG
3790 for (i = 0; i < thread_count; i++) {
3791 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3792 goto exit;
3793 }
3794
3795 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3796 qemu_mutex_init(&decomp_param[i].mutex);
3797 qemu_cond_init(&decomp_param[i].cond);
3798 decomp_param[i].done = true;
3799 decomp_param[i].quit = false;
3800 qemu_thread_create(decompress_threads + i, "decompress",
3801 do_data_decompress, decomp_param + i,
3802 QEMU_THREAD_JOINABLE);
3803 }
3804 return 0;
3805exit:
3806 compress_threads_load_cleanup();
3807 return -1;
3808}
3809
c1bc6626 3810static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3811 void *host, int len)
3812{
3813 int idx, thread_count;
3814
3815 thread_count = migrate_decompress_threads();
37396950 3816 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3817 while (true) {
3818 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3819 if (decomp_param[idx].done) {
33d151f4
LL
3820 decomp_param[idx].done = false;
3821 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3822 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3823 decomp_param[idx].des = host;
3824 decomp_param[idx].len = len;
33d151f4
LL
3825 qemu_cond_signal(&decomp_param[idx].cond);
3826 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3827 break;
3828 }
3829 }
3830 if (idx < thread_count) {
3831 break;
73a8912b
LL
3832 } else {
3833 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3834 }
3835 }
3836}
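/*
 * A minimal pthreads model of the hand-off above: the dispatcher scans for
 * an idle worker under the shared 'done' lock, hands it a job under the
 * worker's own lock, and otherwise sleeps on the completion condition
 * until some worker finishes.  All names here are hypothetical; this is
 * not QEMU's threading code.
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;      /* true while the worker is idle */
    void *job;      /* owned by the worker while done == false */
} Worker;

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;

static void dispatch(Worker *w, int nworkers, void *job)
{
    pthread_mutex_lock(&done_lock);
    for (;;) {
        for (int i = 0; i < nworkers; i++) {
            if (w[i].done) {
                w[i].done = false;
                pthread_mutex_lock(&w[i].lock);
                w[i].job = job;
                pthread_cond_signal(&w[i].cond);    /* wake that worker */
                pthread_mutex_unlock(&w[i].lock);
                pthread_mutex_unlock(&done_lock);
                return;
            }
        }
        /* Every worker is busy: wait for one to signal completion. */
        pthread_cond_wait(&done_cond, &done_lock);
    }
}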
3837
b70cb3b4
RL
3838static void colo_init_ram_state(void)
3839{
3840 ram_state_init(&ram_state);
b70cb3b4
RL
3841}
3842
13af18f2
ZC
3843/*
3844 * COLO cache: this is for the secondary VM. We cache the whole
3845 * memory of the secondary VM; the global lock must be held when
3846 * calling this helper.
3847 */
3848int colo_init_ram_cache(void)
3849{
3850 RAMBlock *block;
3851
44901b5a
PB
3852 WITH_RCU_READ_LOCK_GUARD() {
3853 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3854 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3855 NULL, false, false);
44901b5a
PB
3856 if (!block->colo_cache) {
3857 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3858 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3859 block->used_length);
3860 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3861 if (block->colo_cache) {
3862 qemu_anon_ram_free(block->colo_cache, block->used_length);
3863 block->colo_cache = NULL;
3864 }
89ac5a1d 3865 }
44901b5a 3866 return -errno;
89ac5a1d 3867 }
e5fdf920
LS
3868 if (!machine_dump_guest_core(current_machine)) {
3869 qemu_madvise(block->colo_cache, block->used_length,
3870 QEMU_MADV_DONTDUMP);
3871 }
13af18f2 3872 }
13af18f2 3873 }
44901b5a 3874
7d9acafa
ZC
3875 /*
3876 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3877 * decide which pages in the cache should be flushed into the SVM's RAM.
3878 * Here we use the same name 'ram_bitmap' as for migration.
3879 */
3880 if (ram_bytes_total()) {
3881 RAMBlock *block;
3882
fbd162e6 3883 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3884 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3885 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3886 }
3887 }
7d9acafa 3888
b70cb3b4 3889 colo_init_ram_state();
13af18f2 3890 return 0;
13af18f2
ZC
3891}
3892
0393031a
HZ
3893/* TODO: duplicated with ram_init_bitmaps */
3894void colo_incoming_start_dirty_log(void)
3895{
3896 RAMBlock *block = NULL;
3897 /* For memory_global_dirty_log_start below. */
3898 qemu_mutex_lock_iothread();
3899 qemu_mutex_lock_ramlist();
3900
3901 memory_global_dirty_log_sync();
3902 WITH_RCU_READ_LOCK_GUARD() {
3903 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3904 ramblock_sync_dirty_bitmap(ram_state, block);
3905 /* Discard this dirty bitmap record */
3906 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3907 }
63b41db4 3908 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3909 }
3910 ram_state->migration_dirty_pages = 0;
3911 qemu_mutex_unlock_ramlist();
3912 qemu_mutex_unlock_iothread();
3913}
3914
13af18f2
ZC
3915/* The global lock must be held when calling this helper */
3916void colo_release_ram_cache(void)
3917{
3918 RAMBlock *block;
3919
63b41db4 3920 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3922 g_free(block->bmap);
3923 block->bmap = NULL;
3924 }
3925
89ac5a1d
DDAG
3926 WITH_RCU_READ_LOCK_GUARD() {
3927 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3928 if (block->colo_cache) {
3929 qemu_anon_ram_free(block->colo_cache, block->used_length);
3930 block->colo_cache = NULL;
3931 }
13af18f2
ZC
3932 }
3933 }
0393031a 3934 ram_state_cleanup(&ram_state);
13af18f2
ZC
3935}
3936
f265e0e4
JQ
3937/**
3938 * ram_load_setup: Setup RAM for migration incoming side
3939 *
3940 * Returns zero to indicate success and negative for error
3941 *
3942 * @f: QEMUFile where to receive the data
3943 * @opaque: RAMState pointer
3944 */
3945static int ram_load_setup(QEMUFile *f, void *opaque)
3946{
34ab9e97 3947 if (compress_threads_load_setup(f)) {
797ca154
XG
3948 return -1;
3949 }
3950
f265e0e4 3951 xbzrle_load_setup();
f9494614 3952 ramblock_recv_map_init();
13af18f2 3953
f265e0e4
JQ
3954 return 0;
3955}
3956
3957static int ram_load_cleanup(void *opaque)
3958{
f9494614 3959 RAMBlock *rb;
56eb90af 3960
fbd162e6 3961 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3962 qemu_ram_block_writeback(rb);
56eb90af
JH
3963 }
3964
f265e0e4 3965 xbzrle_load_cleanup();
f0afa331 3966 compress_threads_load_cleanup();
f9494614 3967
fbd162e6 3968 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3969 g_free(rb->receivedmap);
3970 rb->receivedmap = NULL;
3971 }
13af18f2 3972
f265e0e4
JQ
3973 return 0;
3974}
3975
3d0684b2
JQ
3976/**
3977 * ram_postcopy_incoming_init: allocate postcopy data structures
3978 *
3979 * Returns 0 for success and negative if there was one error
3980 *
3981 * @mis: current migration incoming state
3982 *
3983 * Allocate the data structures etc. needed by incoming migration with
3984 * postcopy-ram. postcopy-ram's similarly named
3985 * postcopy_ram_incoming_init() does the work.
1caddf8a
DDAG
3986 */
3987int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3988{
c136180c 3989 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3990}
3991
3d0684b2
JQ
3992/**
3993 * ram_load_postcopy: load a page in postcopy case
3994 *
3995 * Returns 0 for success or -errno in case of error
3996 *
a7180877
DDAG
3997 * Called in postcopy mode by ram_load().
3998 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3999 *
4000 * @f: QEMUFile where to send the data
36f62f11 4001 * @channel: the channel to use for loading
a7180877 4002 */
36f62f11 4003int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
4004{
4005 int flags = 0, ret = 0;
4006 bool place_needed = false;
1aa83678 4007 bool matches_target_page_size = false;
a7180877 4008 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 4009 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
4010
4011 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4012 ram_addr_t addr;
a7180877
DDAG
4013 void *page_buffer = NULL;
4014 void *place_source = NULL;
df9ff5e1 4015 RAMBlock *block = NULL;
a7180877 4016 uint8_t ch;
644acf99 4017 int len;
a7180877
DDAG
4018
4019 addr = qemu_get_be64(f);
7a9ddfbf
PX
4020
4021 /*
4022 * If qemu file error, we should stop here, and then "addr"
4023 * may be invalid
4024 */
4025 ret = qemu_file_get_error(f);
4026 if (ret) {
4027 break;
4028 }
4029
a7180877
DDAG
4030 flags = addr & ~TARGET_PAGE_MASK;
4031 addr &= TARGET_PAGE_MASK;
4032
36f62f11 4033 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4034 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4035 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4036 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4037 if (!block) {
4038 ret = -EINVAL;
4039 break;
4040 }
4c4bad48 4041
898ba906
DH
4042 /*
4043 * Relying on used_length is racy and can result in false positives.
4044 * We might place pages beyond used_length in case RAM was shrunk
4045 * while in postcopy, which is fine - trying to place via
4046 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4047 */
4048 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4049 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4050 ret = -EINVAL;
4051 break;
4052 }
77dadc3f 4053 tmp_page->target_pages++;
1aa83678 4054 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4055 /*
28abd200
DDAG
4056 * Postcopy requires that we place whole host pages atomically;
4057 * these may be huge pages for RAMBlocks that are backed by
4058 * hugetlbfs.
a7180877
DDAG
4059 * To make it atomic, the data is read into a temporary page
4060 * that's moved into place later.
4061 * The migration protocol uses, possibly smaller, target pages;
4062 * however, the source ensures it always sends all the components
91ba442f 4063 * of a host page in one chunk.
a7180877 4064 */
77dadc3f 4065 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4066 host_page_offset_from_ram_block_offset(block, addr);
4067 /* If all TPs are zero then we can optimise the placement */
77dadc3f
PX
4068 if (tmp_page->target_pages == 1) {
4069 tmp_page->host_addr =
4070 host_page_from_ram_block_offset(block, addr);
4071 } else if (tmp_page->host_addr !=
4072 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4073 /* not the 1st TP within the HP */
36f62f11 4074 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4075 "Target host page %p, received host page %p "
4076 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4077 channel, tmp_page->host_addr,
cfc7dc8a
PX
4078 host_page_from_ram_block_offset(block, addr),
4079 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4080 ret = -EINVAL;
4081 break;
a7180877
DDAG
4082 }
4083
4084 /*
4085 * If it's the last part of a host page then we place the host
4086 * page
4087 */
77dadc3f
PX
4088 if (tmp_page->target_pages ==
4089 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4090 place_needed = true;
4cbb3c63 4091 }
77dadc3f 4092 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4093 }
4094
4095 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4096 case RAM_SAVE_FLAG_ZERO:
a7180877 4097 ch = qemu_get_byte(f);
2e36bc1b
WY
4098 /*
4099 * We can skip setting page_buffer when
4100 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4101 */
4102 if (ch || !matches_target_page_size) {
4103 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4104 }
a7180877 4105 if (ch) {
77dadc3f 4106 tmp_page->all_zero = false;
a7180877
DDAG
4107 }
4108 break;
4109
4110 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4111 tmp_page->all_zero = false;
1aa83678
PX
4112 if (!matches_target_page_size) {
4113 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4114 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4115 } else {
1aa83678
PX
4116 /*
4117 * For small pages that match the target page size, we
4118 * avoid the qemu_file copy. Instead we directly use
4119 * the buffer of QEMUFile to place the page. Note: we
4120 * cannot do any QEMUFile operation before using that
4121 * buffer to make sure the buffer is valid when
4122 * placing the page.
a7180877
DDAG
4123 */
4124 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4125 TARGET_PAGE_SIZE);
4126 }
4127 break;
644acf99 4128 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4129 tmp_page->all_zero = false;
644acf99
WY
4130 len = qemu_get_be32(f);
4131 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4132 error_report("Invalid compressed data length: %d", len);
4133 ret = -EINVAL;
4134 break;
4135 }
4136 decompress_data_with_multi_threads(f, page_buffer, len);
4137 break;
4138
a7180877
DDAG
4139 case RAM_SAVE_FLAG_EOS:
4140 /* normal exit */
6df264ac 4141 multifd_recv_sync_main();
a7180877
DDAG
4142 break;
4143 default:
29fccade 4144 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4145 " (postcopy mode)", flags);
4146 ret = -EINVAL;
7a9ddfbf
PX
4147 break;
4148 }
4149
644acf99
WY
4150 /* Got the whole host page, wait for decompress before placing. */
4151 if (place_needed) {
4152 ret |= wait_for_decompress_done();
4153 }
4154
7a9ddfbf
PX
4155 /* Detect any possible file errors */
4156 if (!ret && qemu_file_get_error(f)) {
4157 ret = qemu_file_get_error(f);
a7180877
DDAG
4158 }
4159
7a9ddfbf 4160 if (!ret && place_needed) {
77dadc3f
PX
4161 if (tmp_page->all_zero) {
4162 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4163 } else {
77dadc3f
PX
4164 ret = postcopy_place_page(mis, tmp_page->host_addr,
4165 place_source, block);
a7180877 4166 }
ddf35bdf 4167 place_needed = false;
77dadc3f 4168 postcopy_temp_page_reset(tmp_page);
a7180877 4169 }
a7180877
DDAG
4170 }
4171
4172 return ret;
4173}
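/*
 * A worked example of the host-page assembly above, with assumed sizes:
 * when a RAMBlock is backed by 2 MiB hugepages and the target page size is
 * 4 KiB, block->page_size / TARGET_PAGE_SIZE target pages must be received
 * into the temporary buffer before the single atomic placement.
 */
#include <stdio.h>

int main(void)
{
    const unsigned host_page_size = 2 * 1024 * 1024;  /* 2 MiB hugepage */
    const unsigned target_page_size = 4096;           /* 4 KiB */

    /* 2097152 / 4096 = 512 target pages per placement. */
    printf("target pages per placement: %u\n",
           host_page_size / target_page_size);
    return 0;
}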
4174
acab30b8
DHB
4175static bool postcopy_is_running(void)
4176{
4177 PostcopyState ps = postcopy_state_get();
4178 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4179}
4180
e6f4aa18
ZC
4181/*
4182 * Flush content of RAM cache into SVM's memory.
4183 * Only flush the pages that be dirtied by PVM or SVM or both.
4184 */
24fa16f8 4185void colo_flush_ram_cache(void)
e6f4aa18
ZC
4186{
4187 RAMBlock *block = NULL;
4188 void *dst_host;
4189 void *src_host;
4190 unsigned long offset = 0;
4191
d1955d22 4192 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4193 WITH_RCU_READ_LOCK_GUARD() {
4194 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4195 ramblock_sync_dirty_bitmap(ram_state, block);
4196 }
d1955d22 4197 }
d1955d22 4198
e6f4aa18 4199 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4200 WITH_RCU_READ_LOCK_GUARD() {
4201 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4202
89ac5a1d 4203 while (block) {
a6a83cef 4204 unsigned long num = 0;
e6f4aa18 4205
a6a83cef 4206 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4207 if (!offset_in_ramblock(block,
4208 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4209 offset = 0;
a6a83cef 4210 num = 0;
89ac5a1d
DDAG
4211 block = QLIST_NEXT_RCU(block, next);
4212 } else {
a6a83cef
RL
4213 unsigned long i = 0;
4214
4215 for (i = 0; i < num; i++) {
4216 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4217 }
8bba004c
AR
4218 dst_host = block->host
4219 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4220 src_host = block->colo_cache
4221 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4222 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4223 offset += num;
89ac5a1d 4224 }
e6f4aa18
ZC
4225 }
4226 }
e6f4aa18
ZC
4227 trace_colo_flush_ram_cache_end();
4228}
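/*
 * A minimal sketch of the "find a run of dirty pages, then copy the whole
 * run at once" idea used above.  The real colo_bitmap_find_dirty() works on
 * QEMU's bitmap helpers; here a plain unsigned-long bitmap is scanned, and
 * test_bit_sketch()/next_dirty_run() are hypothetical names.  A caller
 * would memcpy len * page_size bytes and then continue from pos + len.
 */
#include <stdbool.h>
#include <stddef.h>

#define BITS_PER_WORD (8 * sizeof(unsigned long))

static bool test_bit_sketch(const unsigned long *map, size_t nr)
{
    return (map[nr / BITS_PER_WORD] >> (nr % BITS_PER_WORD)) & 1UL;
}

static bool next_dirty_run(const unsigned long *map, size_t nbits,
                           size_t *pos, size_t *len)
{
    size_t i = *pos;

    while (i < nbits && !test_bit_sketch(map, i)) {
        i++;                        /* skip clean pages */
    }
    if (i == nbits) {
        return false;               /* no dirty page left */
    }
    size_t j = i;
    while (j < nbits && test_bit_sketch(map, j)) {
        j++;                        /* extend the dirty run */
    }
    *pos = i;
    *len = j - i;
    return true;
}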
4229
10da4a36
WY
4230/**
4231 * ram_load_precopy: load pages in precopy case
4232 *
4233 * Returns 0 for success or -errno in case of error
4234 *
4235 * Called in precopy mode by ram_load().
4236 * rcu_read_lock is taken prior to this being called.
4237 *
4238 * @f: QEMUFile where to send the data
4239 */
4240static int ram_load_precopy(QEMUFile *f)
56e93d26 4241{
755e8d7c 4242 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4243 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4244 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4245 bool postcopy_advised = migration_incoming_postcopy_advised();
edc60127
JQ
4246 if (!migrate_use_compression()) {
4247 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4248 }
a7180877 4249
10da4a36 4250 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4251 ram_addr_t addr, total_ram_bytes;
0393031a 4252 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4253 uint8_t ch;
4254
e65cec5e
YK
4255 /*
4256 * Yield periodically to let the main loop run, but an iteration of
4257 * the main loop is expensive, so only do it every so many iterations
4258 */
4259 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4260 aio_co_schedule(qemu_get_current_aio_context(),
4261 qemu_coroutine_self());
4262 qemu_coroutine_yield();
4263 }
4264 i++;
4265
56e93d26
JQ
4266 addr = qemu_get_be64(f);
4267 flags = addr & ~TARGET_PAGE_MASK;
4268 addr &= TARGET_PAGE_MASK;
4269
edc60127
JQ
4270 if (flags & invalid_flags) {
4271 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4272 error_report("Received an unexpected compressed page");
4273 }
4274
4275 ret = -EINVAL;
4276 break;
4277 }
4278
bb890ed5 4279 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4280 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4281 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4282 RAM_CHANNEL_PRECOPY);
4c4bad48 4283
0393031a 4284 host = host_from_ram_block_offset(block, addr);
13af18f2 4285 /*
0393031a
HZ
4286 * After entering the COLO stage, we should not load a page
4287 * into the SVM's memory directly; we put it into colo_cache first.
4288 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4289 * Previously, we copied all this memory in the COLO preparation
4290 * stage, during which the VM had to be stopped, which is time-consuming.
4291 * Here we optimize it with a trick: back up every page during the
4292 * migration process while COLO is enabled. Although this affects the
4293 * speed of the migration, it clearly reduces the downtime caused by
4294 * backing up all of the SVM's memory in the COLO preparation stage.
13af18f2 4295 */
0393031a
HZ
4296 if (migration_incoming_colo_enabled()) {
4297 if (migration_incoming_in_colo_state()) {
4298 /* In COLO stage, put all pages into cache temporarily */
8af66371 4299 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4300 } else {
4301 /*
4302 * In migration stage but before COLO stage,
4303 * Put all pages into both cache and SVM's memory.
4304 */
8af66371 4305 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4306 }
13af18f2 4307 }
a776aa15
DDAG
4308 if (!host) {
4309 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4310 ret = -EINVAL;
4311 break;
4312 }
13af18f2
ZC
4313 if (!migration_incoming_in_colo_state()) {
4314 ramblock_recv_bitmap_set(block, host);
4315 }
4316
1db9d8e5 4317 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4318 }
4319
56e93d26
JQ
4320 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4321 case RAM_SAVE_FLAG_MEM_SIZE:
4322 /* Synchronize RAM block list */
4323 total_ram_bytes = addr;
4324 while (!ret && total_ram_bytes) {
4325 RAMBlock *block;
56e93d26
JQ
4326 char id[256];
4327 ram_addr_t length;
4328
4329 len = qemu_get_byte(f);
4330 qemu_get_buffer(f, (uint8_t *)id, len);
4331 id[len] = 0;
4332 length = qemu_get_be64(f);
4333
e3dd7493 4334 block = qemu_ram_block_by_name(id);
b895de50
CLG
4335 if (block && !qemu_ram_is_migratable(block)) {
4336 error_report("block %s should not be migrated !", id);
4337 ret = -EINVAL;
4338 } else if (block) {
e3dd7493
DDAG
4339 if (length != block->used_length) {
4340 Error *local_err = NULL;
56e93d26 4341
fa53a0e5 4342 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4343 &local_err);
4344 if (local_err) {
4345 error_report_err(local_err);
56e93d26 4346 }
56e93d26 4347 }
ef08fb38 4348 /* For postcopy we need to check hugepage sizes match */
e846b746 4349 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4350 block->page_size != qemu_host_page_size) {
4351 uint64_t remote_page_size = qemu_get_be64(f);
4352 if (remote_page_size != block->page_size) {
4353 error_report("Mismatched RAM page size %s "
4354 "(local) %zd != %" PRId64,
4355 id, block->page_size,
4356 remote_page_size);
4357 ret = -EINVAL;
4358 }
4359 }
fbd162e6
YK
4360 if (migrate_ignore_shared()) {
4361 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4362 if (ramblock_is_ignored(block) &&
4363 block->mr->addr != addr) {
4364 error_report("Mismatched GPAs for block %s "
4365 "%" PRId64 "!= %" PRId64,
4366 id, (uint64_t)addr,
4367 (uint64_t)block->mr->addr);
4368 ret = -EINVAL;
4369 }
4370 }
e3dd7493
DDAG
4371 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4372 block->idstr);
4373 } else {
56e93d26
JQ
4374 error_report("Unknown ramblock \"%s\", cannot "
4375 "accept migration", id);
4376 ret = -EINVAL;
4377 }
4378
4379 total_ram_bytes -= length;
4380 }
4381 break;
a776aa15 4382
bb890ed5 4383 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4384 ch = qemu_get_byte(f);
4385 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4386 break;
a776aa15 4387
56e93d26 4388 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4389 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4390 break;
56e93d26 4391
a776aa15 4392 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4393 len = qemu_get_be32(f);
4394 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4395 error_report("Invalid compressed data length: %d", len);
4396 ret = -EINVAL;
4397 break;
4398 }
c1bc6626 4399 decompress_data_with_multi_threads(f, host, len);
56e93d26 4400 break;
a776aa15 4401
56e93d26 4402 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4403 if (load_xbzrle(f, addr, host) < 0) {
4404 error_report("Failed to decompress XBZRLE page at "
4405 RAM_ADDR_FMT, addr);
4406 ret = -EINVAL;
4407 break;
4408 }
4409 break;
4410 case RAM_SAVE_FLAG_EOS:
4411 /* normal exit */
6df264ac 4412 multifd_recv_sync_main();
56e93d26
JQ
4413 break;
4414 default:
4415 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4416 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4417 } else {
29fccade 4418 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4419 flags);
4420 ret = -EINVAL;
4421 }
4422 }
4423 if (!ret) {
4424 ret = qemu_file_get_error(f);
4425 }
0393031a
HZ
4426 if (!ret && host_bak) {
4427 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4428 }
56e93d26
JQ
4429 }
4430
ca1a6b70 4431 ret |= wait_for_decompress_done();
10da4a36
WY
4432 return ret;
4433}
4434
4435static int ram_load(QEMUFile *f, void *opaque, int version_id)
4436{
4437 int ret = 0;
4438 static uint64_t seq_iter;
4439 /*
4440 * If system is running in postcopy mode, page inserts to host memory must
4441 * be atomic
4442 */
4443 bool postcopy_running = postcopy_is_running();
4444
4445 seq_iter++;
4446
4447 if (version_id != 4) {
4448 return -EINVAL;
4449 }
4450
4451 /*
4452 * This RCU critical section can be very long running.
4453 * When RCU reclaims in the code start to become numerous,
4454 * it will be necessary to reduce the granularity of this
4455 * critical section.
4456 */
89ac5a1d
DDAG
4457 WITH_RCU_READ_LOCK_GUARD() {
4458 if (postcopy_running) {
36f62f11
PX
4459 /*
4460 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4461 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4462 * service fast page faults.
4463 */
4464 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4465 } else {
4466 ret = ram_load_precopy(f);
4467 }
10da4a36 4468 }
55c4446b 4469 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4470
56e93d26
JQ
4471 return ret;
4472}
4473
c6467627
VSO
4474static bool ram_has_postcopy(void *opaque)
4475{
469dd51b 4476 RAMBlock *rb;
fbd162e6 4477 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4478 if (ramblock_is_pmem(rb)) {
4479 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4480 "is not supported now!", rb->idstr, rb->host);
4481 return false;
4482 }
4483 }
4484
c6467627
VSO
4485 return migrate_postcopy_ram();
4486}
4487
edd090c7
PX
4488/* Sync all the dirty bitmaps with the destination VM. */
4489static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4490{
4491 RAMBlock *block;
4492 QEMUFile *file = s->to_dst_file;
4493 int ramblock_count = 0;
4494
4495 trace_ram_dirty_bitmap_sync_start();
4496
fbd162e6 4497 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4498 qemu_savevm_send_recv_bitmap(file, block->idstr);
4499 trace_ram_dirty_bitmap_request(block->idstr);
4500 ramblock_count++;
4501 }
4502
4503 trace_ram_dirty_bitmap_sync_wait();
4504
4505 /* Wait until all the ramblocks' dirty bitmaps are synced */
4506 while (ramblock_count--) {
4507 qemu_sem_wait(&s->rp_state.rp_sem);
4508 }
4509
4510 trace_ram_dirty_bitmap_sync_complete();
4511
4512 return 0;
4513}
4514
4515static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4516{
4517 qemu_sem_post(&s->rp_state.rp_sem);
4518}
4519
a335debb
PX
4520/*
4521 * Read the received bitmap, invert it into the initial dirty bitmap.
4522 * This is only used when the postcopy migration is paused but wants
4523 * to resume from a middle point.
4524 */
4525int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4526{
4527 int ret = -EINVAL;
43044ac0 4528 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4529 QEMUFile *file = s->rp_state.from_dst_file;
4530 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4531 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4532 uint64_t size, end_mark;
4533
4534 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4535
4536 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4537 error_report("%s: incorrect state %s", __func__,
4538 MigrationStatus_str(s->state));
4539 return -EINVAL;
4540 }
4541
4542 /*
4543 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4544 * need the endianness conversion, and the paddings.
a335debb
PX
4545 */
4546 local_size = ROUND_UP(local_size, 8);
4547
4548 /* Add paddings */
4549 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4550
4551 size = qemu_get_be64(file);
4552
4553 /* The size of the bitmap should match with our ramblock */
4554 if (size != local_size) {
4555 error_report("%s: ramblock '%s' bitmap size mismatch "
4556 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4557 block->idstr, size, local_size);
4558 ret = -EINVAL;
4559 goto out;
4560 }
4561
4562 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4563 end_mark = qemu_get_be64(file);
4564
4565 ret = qemu_file_get_error(file);
4566 if (ret || size != local_size) {
4567 error_report("%s: read bitmap failed for ramblock '%s': %d"
4568 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4569 __func__, block->idstr, ret, local_size, size);
4570 ret = -EIO;
4571 goto out;
4572 }
4573
4574 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4575 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4576 __func__, block->idstr, end_mark);
4577 ret = -EINVAL;
4578 goto out;
4579 }
4580
4581 /*
3a4452d8 4582 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4583 * The dirty bitmap won't change. We can directly modify it.
4584 */
4585 bitmap_from_le(block->bmap, le_bitmap, nbits);
4586
4587 /*
4588 * What we received is the "received bitmap". Invert it to form the initial
4589 * dirty bitmap for this ramblock.
4590 */
4591 bitmap_complement(block->bmap, block->bmap, nbits);
4592
be39b4cd
DH
4593 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4594 ramblock_dirty_bitmap_clear_discarded_pages(block);
4595
4596 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4597 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4598
edd090c7
PX
4599 /*
4600 * We succeeded in syncing the bitmap for the current ramblock. If this is
4601 * the last one to sync, we need to notify the main send thread.
4602 */
4603 ram_dirty_bitmap_reload_notify(s);
4604
a335debb
PX
4605 ret = 0;
4606out:
bf269906 4607 g_free(le_bitmap);
a335debb
PX
4608 return ret;
4609}
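/*
 * A minimal sketch of the inversion step above: the destination reports
 * which pages it has already received, so every page it has NOT received
 * must be marked dirty and re-sent.  This simplified version ignores the
 * little-endian conversion, the padding and the trailing partial word that
 * the real code handles; received_to_dirty() is a hypothetical name.
 */
#include <stddef.h>

static void received_to_dirty(unsigned long *bmap, size_t nlongs)
{
    for (size_t i = 0; i < nlongs; i++) {
        bmap[i] = ~bmap[i];   /* dirty = complement of received */
    }
}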
4610
edd090c7
PX
4611static int ram_resume_prepare(MigrationState *s, void *opaque)
4612{
4613 RAMState *rs = *(RAMState **)opaque;
08614f34 4614 int ret;
edd090c7 4615
08614f34
PX
4616 ret = ram_dirty_bitmap_sync_all(s, rs);
4617 if (ret) {
4618 return ret;
4619 }
4620
4621 ram_state_resume_prepare(rs, s->to_dst_file);
4622
4623 return 0;
edd090c7
PX
4624}
4625
36f62f11
PX
4626void postcopy_preempt_shutdown_file(MigrationState *s)
4627{
4628 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4629 qemu_fflush(s->postcopy_qemufile_src);
4630}
4631
56e93d26 4632static SaveVMHandlers savevm_ram_handlers = {
9907e842 4633 .save_setup = ram_save_setup,
56e93d26 4634 .save_live_iterate = ram_save_iterate,
763c906b 4635 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4636 .save_live_complete_precopy = ram_save_complete,
c6467627 4637 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4638 .state_pending_exact = ram_state_pending_exact,
4639 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4640 .load_state = ram_load,
f265e0e4
JQ
4641 .save_cleanup = ram_save_cleanup,
4642 .load_setup = ram_load_setup,
4643 .load_cleanup = ram_load_cleanup,
edd090c7 4644 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4645};
4646
c7c0e724
DH
4647static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4648 size_t old_size, size_t new_size)
4649{
cc61c703 4650 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4651 ram_addr_t offset;
4652 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4653 Error *err = NULL;
4654
4655 if (ramblock_is_ignored(rb)) {
4656 return;
4657 }
4658
4659 if (!migration_is_idle()) {
4660 /*
4661 * Precopy code on the source cannot deal with the size of RAM blocks
4662 * changing at random points in time - especially after sending the
4663 * RAM block sizes in the migration stream, they must no longer change.
4664 * Abort and indicate a proper reason.
4665 */
4666 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4667 migration_cancel(err);
c7c0e724 4668 error_free(err);
c7c0e724 4669 }
cc61c703
DH
4670
4671 switch (ps) {
4672 case POSTCOPY_INCOMING_ADVISE:
4673 /*
4674 * Update what ram_postcopy_incoming_init()->init_range() does at the
4675 * time postcopy was advised. Syncing RAM blocks with the source will
4676 * result in RAM resizes.
4677 */
4678 if (old_size < new_size) {
4679 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4680 error_report("RAM block '%s' discard of resized RAM failed",
4681 rb->idstr);
4682 }
4683 }
898ba906 4684 rb->postcopy_length = new_size;
cc61c703
DH
4685 break;
4686 case POSTCOPY_INCOMING_NONE:
4687 case POSTCOPY_INCOMING_RUNNING:
4688 case POSTCOPY_INCOMING_END:
4689 /*
4690 * Once our guest is running, postcopy no longer cares about
4691 * resizes. When growing, the new memory was not available on the
4692 * source, no handler needed.
4693 */
4694 break;
4695 default:
4696 error_report("RAM block '%s' resized during postcopy state: %d",
4697 rb->idstr, ps);
4698 exit(-1);
4699 }
c7c0e724
DH
4700}
4701
4702static RAMBlockNotifier ram_mig_ram_notifier = {
4703 .ram_block_resized = ram_mig_ram_block_resized,
4704};
4705
56e93d26
JQ
4706void ram_mig_init(void)
4707{
4708 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4709 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4710 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4711}