]> git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
migration: Simplify ram_find_and_save_block()
[mirror_qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
53d37d36 43#include "page_cache.h"
56e93d26 44#include "qemu/error-report.h"
e688df6b 45#include "qapi/error.h"
ab7cbb0b 46#include "qapi/qapi-types-migration.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
b0c3cf94 55#include "sysemu/cpu-throttle.h"
edd090c7 56#include "savevm.h"
b9ee2f7d 57#include "qemu/iov.h"
d32ca5ad 58#include "multifd.h"
278e2f55
AG
59#include "sysemu/runstate.h"
60
e5fdf920
LS
61#include "hw/boards.h" /* for machine_dump_guest_core() */
62
278e2f55
AG
63#if defined(__linux__)
64#include "qemu/userfaultfd.h"
65#endif /* defined(__linux__) */
56e93d26 66
56e93d26
JQ
67/***********************************************************/
68/* ram save/restore */
69
bb890ed5
JQ
70/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
71 * worked for pages that where filled with the same char. We switched
72 * it to only search for the zero value. And to avoid confusion with
73 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
74 */
75
56e93d26 76#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 77#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
78#define RAM_SAVE_FLAG_MEM_SIZE 0x04
79#define RAM_SAVE_FLAG_PAGE 0x08
80#define RAM_SAVE_FLAG_EOS 0x10
81#define RAM_SAVE_FLAG_CONTINUE 0x20
82#define RAM_SAVE_FLAG_XBZRLE 0x40
83/* 0x80 is reserved in migration.h start with 0x100 next */
84#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
85
9360447d
JQ
86XBZRLECacheStats xbzrle_counters;
87
f1668764
PX
88/* used by the search for pages to send */
89struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
ec6f3ab9
PX
92 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
f1668764
PX
94 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
f1668764
PX
100 /* Whether we're sending a host page */
101 bool host_page_sending;
102 /* The start/end of current host page. Invalid if host_page_sending==false */
103 unsigned long host_page_start;
104 unsigned long host_page_end;
105};
106typedef struct PageSearchStatus PageSearchStatus;
107
56e93d26
JQ
108/* struct contains XBZRLE cache and a static page
109 used by the compression */
110static struct {
111 /* buffer used for XBZRLE encoding */
112 uint8_t *encoded_buf;
113 /* buffer for storing page content */
114 uint8_t *current_buf;
115 /* Cache for XBZRLE, Protected by lock. */
116 PageCache *cache;
117 QemuMutex lock;
c00e0928
JQ
118 /* it will store a page full of zeros */
119 uint8_t *zero_target_page;
f265e0e4
JQ
120 /* buffer used for XBZRLE decoding */
121 uint8_t *decoded_buf;
56e93d26
JQ
122} XBZRLE;
123
56e93d26
JQ
124static void XBZRLE_cache_lock(void)
125{
f4c51a6b 126 if (migrate_use_xbzrle()) {
56e93d26 127 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 128 }
56e93d26
JQ
129}
130
131static void XBZRLE_cache_unlock(void)
132{
f4c51a6b 133 if (migrate_use_xbzrle()) {
56e93d26 134 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 135 }
56e93d26
JQ
136}
137
3d0684b2
JQ
138/**
139 * xbzrle_cache_resize: resize the xbzrle cache
140 *
cbde7be9 141 * This function is called from migrate_params_apply in main
3d0684b2
JQ
142 * thread, possibly while a migration is in progress. A running
143 * migration may be using the cache and might finish during this call,
144 * hence changes to the cache are protected by XBZRLE.lock().
145 *
c9dede2d 146 * Returns 0 for success or -1 for error
3d0684b2
JQ
147 *
148 * @new_size: new cache size
8acabf69 149 * @errp: set *errp if the check failed, with reason
56e93d26 150 */
8b9407a0 151int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
152{
153 PageCache *new_cache;
c9dede2d 154 int64_t ret = 0;
56e93d26 155
8acabf69
JQ
156 /* Check for truncation */
157 if (new_size != (size_t)new_size) {
158 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
159 "exceeding address space");
160 return -1;
161 }
162
2a313e5c
JQ
163 if (new_size == migrate_xbzrle_cache_size()) {
164 /* nothing to do */
c9dede2d 165 return 0;
2a313e5c
JQ
166 }
167
56e93d26
JQ
168 XBZRLE_cache_lock();
169
170 if (XBZRLE.cache != NULL) {
80f8dfde 171 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 172 if (!new_cache) {
56e93d26
JQ
173 ret = -1;
174 goto out;
175 }
176
177 cache_fini(XBZRLE.cache);
178 XBZRLE.cache = new_cache;
179 }
56e93d26
JQ
180out:
181 XBZRLE_cache_unlock();
182 return ret;
183}
184
20123ee1
PX
185static bool postcopy_preempt_active(void)
186{
187 return migrate_postcopy_preempt() && migration_in_postcopy();
188}
189
3ded54b1 190bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
191{
192 return !qemu_ram_is_migratable(block) ||
193 (migrate_ignore_shared() && qemu_ram_is_shared(block));
194}
195
343f632c
DDAG
196#undef RAMBLOCK_FOREACH
197
fbd162e6
YK
198int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
199{
200 RAMBlock *block;
201 int ret = 0;
202
89ac5a1d
DDAG
203 RCU_READ_LOCK_GUARD();
204
fbd162e6
YK
205 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
206 ret = func(block, opaque);
207 if (ret) {
208 break;
209 }
210 }
fbd162e6
YK
211 return ret;
212}
213
f9494614
AP
214static void ramblock_recv_map_init(void)
215{
216 RAMBlock *rb;
217
fbd162e6 218 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
219 assert(!rb->receivedmap);
220 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
221 }
222}
223
224int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
225{
226 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
227 rb->receivedmap);
228}
229
1cba9f6e
DDAG
230bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
231{
232 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
233}
234
f9494614
AP
235void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
236{
237 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
238}
239
240void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
241 size_t nr)
242{
243 bitmap_set_atomic(rb->receivedmap,
244 ramblock_recv_bitmap_offset(host_addr, rb),
245 nr);
246}
247
a335debb
PX
248#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
249
250/*
251 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
252 *
253 * Returns >0 if success with sent bytes, or <0 if error.
254 */
255int64_t ramblock_recv_bitmap_send(QEMUFile *file,
256 const char *block_name)
257{
258 RAMBlock *block = qemu_ram_block_by_name(block_name);
259 unsigned long *le_bitmap, nbits;
260 uint64_t size;
261
262 if (!block) {
263 error_report("%s: invalid block name: %s", __func__, block_name);
264 return -1;
265 }
266
898ba906 267 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
268
269 /*
270 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
271 * machines we may need 4 more bytes for padding (see below
272 * comment). So extend it a bit before hand.
273 */
274 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
275
276 /*
277 * Always use little endian when sending the bitmap. This is
278 * required that when source and destination VMs are not using the
3a4452d8 279 * same endianness. (Note: big endian won't work.)
a335debb
PX
280 */
281 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
282
283 /* Size of the bitmap, in bytes */
a725ef9f 284 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
285
286 /*
287 * size is always aligned to 8 bytes for 64bit machines, but it
288 * may not be true for 32bit machines. We need this padding to
289 * make sure the migration can survive even between 32bit and
290 * 64bit machines.
291 */
292 size = ROUND_UP(size, 8);
293
294 qemu_put_be64(file, size);
295 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
296 /*
297 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 298 * some "mysterious" reason.
a335debb
PX
299 */
300 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
301 qemu_fflush(file);
302
bf269906 303 g_free(le_bitmap);
a335debb
PX
304
305 if (qemu_file_get_error(file)) {
306 return qemu_file_get_error(file);
307 }
308
309 return size + sizeof(size);
310}
311
ec481c6c
JQ
312/*
313 * An outstanding page request, on the source, having been received
314 * and queued
315 */
316struct RAMSrcPageRequest {
317 RAMBlock *rb;
318 hwaddr offset;
319 hwaddr len;
320
321 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
322};
323
6f37bb8b
JQ
324/* State of RAM for migration */
325struct RAMState {
f1668764
PX
326 /*
327 * PageSearchStatus structures for the channels when send pages.
328 * Protected by the bitmap_mutex.
329 */
330 PageSearchStatus pss[RAM_CHANNEL_MAX];
278e2f55
AG
331 /* UFFD file descriptor, used in 'write-tracking' migration */
332 int uffdio_fd;
6f37bb8b
JQ
333 /* Last block that we have visited searching for dirty pages */
334 RAMBlock *last_seen_block;
269ace29
JQ
335 /* Last dirty target page we have sent */
336 ram_addr_t last_page;
6f37bb8b
JQ
337 /* last ram version we have seen */
338 uint32_t last_version;
8d820d6f
JQ
339 /* How many times we have dirty too many pages */
340 int dirty_rate_high_cnt;
f664da80
JQ
341 /* these variables are used for bitmap sync */
342 /* last time we did a full bitmap_sync */
343 int64_t time_last_bitmap_sync;
eac74159 344 /* bytes transferred at start_time */
c4bdf0cf 345 uint64_t bytes_xfer_prev;
a66cd90c 346 /* number of dirty pages since start_time */
68908ed6 347 uint64_t num_dirty_pages_period;
b5833fde
JQ
348 /* xbzrle misses since the beginning of the period */
349 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
350 /* Amount of xbzrle pages since the beginning of the period */
351 uint64_t xbzrle_pages_prev;
352 /* Amount of xbzrle encoded bytes since the beginning of the period */
353 uint64_t xbzrle_bytes_prev;
1a373522
DH
354 /* Start using XBZRLE (e.g., after the first round). */
355 bool xbzrle_enabled;
05931ec5
JQ
356 /* Are we on the last stage of migration */
357 bool last_stage;
76e03000
XG
358 /* compression statistics since the beginning of the period */
359 /* amount of count that no free thread to compress data */
360 uint64_t compress_thread_busy_prev;
361 /* amount bytes after compression */
362 uint64_t compressed_size_prev;
363 /* amount of compressed pages */
364 uint64_t compress_pages_prev;
365
be8b02ed
XG
366 /* total handled target pages at the beginning of period */
367 uint64_t target_page_count_prev;
368 /* total handled target pages since start */
369 uint64_t target_page_count;
9360447d 370 /* number of dirty bits in the bitmap */
2dfaf12e 371 uint64_t migration_dirty_pages;
f1668764
PX
372 /*
373 * Protects:
374 * - dirty/clear bitmap
375 * - migration_dirty_pages
376 * - pss structures
377 */
108cfae0 378 QemuMutex bitmap_mutex;
68a098f3
JQ
379 /* The RAMBlock used in the last src_page_requests */
380 RAMBlock *last_req_rb;
ec481c6c
JQ
381 /* Queue of outstanding page requests from the destination */
382 QemuMutex src_page_req_mutex;
b58deb34 383 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
384};
385typedef struct RAMState RAMState;
386
53518d94 387static RAMState *ram_state;
6f37bb8b 388
bd227060
WW
389static NotifierWithReturnList precopy_notifier_list;
390
a1fe28df
PX
391/* Whether postcopy has queued requests? */
392static bool postcopy_has_request(RAMState *rs)
393{
394 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
395}
396
bd227060
WW
397void precopy_infrastructure_init(void)
398{
399 notifier_with_return_list_init(&precopy_notifier_list);
400}
401
402void precopy_add_notifier(NotifierWithReturn *n)
403{
404 notifier_with_return_list_add(&precopy_notifier_list, n);
405}
406
407void precopy_remove_notifier(NotifierWithReturn *n)
408{
409 notifier_with_return_remove(n);
410}
411
412int precopy_notify(PrecopyNotifyReason reason, Error **errp)
413{
414 PrecopyNotifyData pnd;
415 pnd.reason = reason;
416 pnd.errp = errp;
417
418 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
419}
420
9edabd4d 421uint64_t ram_bytes_remaining(void)
2f4fde93 422{
bae416e5
DDAG
423 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
424 0;
2f4fde93
JQ
425}
426
23b7576d
PX
427/*
428 * NOTE: not all stats in ram_counters are used in reality. See comments
429 * for struct MigrationAtomicStats. The ultimate result of ram migration
430 * counters will be a merged version with both ram_counters and the atomic
431 * fields in ram_atomic_counters.
432 */
9360447d 433MigrationStats ram_counters;
23b7576d 434MigrationAtomicStats ram_atomic_counters;
96506894 435
26a26069 436void ram_transferred_add(uint64_t bytes)
4c2d0f6d 437{
ae680668
DE
438 if (runstate_is_running()) {
439 ram_counters.precopy_bytes += bytes;
440 } else if (migration_in_postcopy()) {
23b7576d 441 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
ae680668
DE
442 } else {
443 ram_counters.downtime_bytes += bytes;
444 }
23b7576d 445 stat64_add(&ram_atomic_counters.transferred, bytes);
4c2d0f6d
DE
446}
447
d59c40cc
LB
448void dirty_sync_missed_zero_copy(void)
449{
450 ram_counters.dirty_sync_missed_zero_copy++;
451}
452
76e03000
XG
453CompressionStats compression_counters;
454
56e93d26 455struct CompressParam {
56e93d26 456 bool done;
90e56fb4 457 bool quit;
5e5fdcff 458 bool zero_page;
56e93d26
JQ
459 QEMUFile *file;
460 QemuMutex mutex;
461 QemuCond cond;
462 RAMBlock *block;
463 ram_addr_t offset;
34ab9e97
XG
464
465 /* internally used fields */
dcaf446e 466 z_stream stream;
34ab9e97 467 uint8_t *originbuf;
56e93d26
JQ
468};
469typedef struct CompressParam CompressParam;
470
471struct DecompressParam {
73a8912b 472 bool done;
90e56fb4 473 bool quit;
56e93d26
JQ
474 QemuMutex mutex;
475 QemuCond cond;
476 void *des;
d341d9f3 477 uint8_t *compbuf;
56e93d26 478 int len;
797ca154 479 z_stream stream;
56e93d26
JQ
480};
481typedef struct DecompressParam DecompressParam;
482
483static CompressParam *comp_param;
484static QemuThread *compress_threads;
485/* comp_done_cond is used to wake up the migration thread when
486 * one of the compression threads has finished the compression.
487 * comp_done_lock is used to co-work with comp_done_cond.
488 */
0d9f9a5c
LL
489static QemuMutex comp_done_lock;
490static QemuCond comp_done_cond;
56e93d26 491
34ab9e97 492static QEMUFile *decomp_file;
56e93d26
JQ
493static DecompressParam *decomp_param;
494static QemuThread *decompress_threads;
73a8912b
LL
495static QemuMutex decomp_done_lock;
496static QemuCond decomp_done_cond;
56e93d26 497
93589827
PX
498static int ram_save_host_page_urgent(PageSearchStatus *pss);
499
5e5fdcff 500static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 501 ram_addr_t offset, uint8_t *source_buf);
56e93d26 502
ebd88a49
PX
503/* NOTE: page is the PFN not real ram_addr_t. */
504static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
505{
506 pss->block = rb;
507 pss->page = page;
508 pss->complete_round = false;
509}
510
93589827
PX
511/*
512 * Check whether two PSSs are actively sending the same page. Return true
513 * if it is, false otherwise.
514 */
515static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
516{
517 return pss1->host_page_sending && pss2->host_page_sending &&
518 (pss1->host_page_start == pss2->host_page_start);
519}
520
56e93d26
JQ
521static void *do_data_compress(void *opaque)
522{
523 CompressParam *param = opaque;
a7a9a88f
LL
524 RAMBlock *block;
525 ram_addr_t offset;
5e5fdcff 526 bool zero_page;
56e93d26 527
a7a9a88f 528 qemu_mutex_lock(&param->mutex);
90e56fb4 529 while (!param->quit) {
a7a9a88f
LL
530 if (param->block) {
531 block = param->block;
532 offset = param->offset;
533 param->block = NULL;
534 qemu_mutex_unlock(&param->mutex);
535
5e5fdcff
XG
536 zero_page = do_compress_ram_page(param->file, &param->stream,
537 block, offset, param->originbuf);
a7a9a88f 538
0d9f9a5c 539 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 540 param->done = true;
5e5fdcff 541 param->zero_page = zero_page;
0d9f9a5c
LL
542 qemu_cond_signal(&comp_done_cond);
543 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
544
545 qemu_mutex_lock(&param->mutex);
546 } else {
56e93d26
JQ
547 qemu_cond_wait(&param->cond, &param->mutex);
548 }
56e93d26 549 }
a7a9a88f 550 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
551
552 return NULL;
553}
554
f0afa331 555static void compress_threads_save_cleanup(void)
56e93d26
JQ
556{
557 int i, thread_count;
558
05306935 559 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
560 return;
561 }
05306935 562
56e93d26
JQ
563 thread_count = migrate_compress_threads();
564 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
565 /*
566 * we use it as a indicator which shows if the thread is
567 * properly init'd or not
568 */
569 if (!comp_param[i].file) {
570 break;
571 }
05306935
FL
572
573 qemu_mutex_lock(&comp_param[i].mutex);
574 comp_param[i].quit = true;
575 qemu_cond_signal(&comp_param[i].cond);
576 qemu_mutex_unlock(&comp_param[i].mutex);
577
56e93d26 578 qemu_thread_join(compress_threads + i);
56e93d26
JQ
579 qemu_mutex_destroy(&comp_param[i].mutex);
580 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 581 deflateEnd(&comp_param[i].stream);
34ab9e97 582 g_free(comp_param[i].originbuf);
dcaf446e
XG
583 qemu_fclose(comp_param[i].file);
584 comp_param[i].file = NULL;
56e93d26 585 }
0d9f9a5c
LL
586 qemu_mutex_destroy(&comp_done_lock);
587 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
588 g_free(compress_threads);
589 g_free(comp_param);
56e93d26
JQ
590 compress_threads = NULL;
591 comp_param = NULL;
56e93d26
JQ
592}
593
dcaf446e 594static int compress_threads_save_setup(void)
56e93d26
JQ
595{
596 int i, thread_count;
597
598 if (!migrate_use_compression()) {
dcaf446e 599 return 0;
56e93d26 600 }
56e93d26
JQ
601 thread_count = migrate_compress_threads();
602 compress_threads = g_new0(QemuThread, thread_count);
603 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
604 qemu_cond_init(&comp_done_cond);
605 qemu_mutex_init(&comp_done_lock);
56e93d26 606 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608 if (!comp_param[i].originbuf) {
609 goto exit;
610 }
611
dcaf446e
XG
612 if (deflateInit(&comp_param[i].stream,
613 migrate_compress_level()) != Z_OK) {
34ab9e97 614 g_free(comp_param[i].originbuf);
dcaf446e
XG
615 goto exit;
616 }
617
e110aa91
C
618 /* comp_param[i].file is just used as a dummy buffer to save data,
619 * set its ops to empty.
56e93d26 620 */
77ef2dc1 621 comp_param[i].file = qemu_file_new_output(
c0e0825c 622 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 623 comp_param[i].done = true;
90e56fb4 624 comp_param[i].quit = false;
56e93d26
JQ
625 qemu_mutex_init(&comp_param[i].mutex);
626 qemu_cond_init(&comp_param[i].cond);
627 qemu_thread_create(compress_threads + i, "compress",
628 do_data_compress, comp_param + i,
629 QEMU_THREAD_JOINABLE);
630 }
dcaf446e
XG
631 return 0;
632
633exit:
634 compress_threads_save_cleanup();
635 return -1;
56e93d26
JQ
636}
637
638/**
3d0684b2 639 * save_page_header: write page header to wire
56e93d26
JQ
640 *
641 * If this is the 1st block, it also writes the block identification
642 *
3d0684b2 643 * Returns the number of bytes written
56e93d26 644 *
ec6f3ab9 645 * @pss: current PSS channel status
56e93d26
JQ
646 * @block: block that contains the page we want to send
647 * @offset: offset inside the block for the page
648 * in the lower bits, it contains flags
649 */
ec6f3ab9 650static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
2bf3aa85 651 ram_addr_t offset)
56e93d26 652{
9f5f380b 653 size_t size, len;
ec6f3ab9
PX
654 bool same_block = (block == pss->last_sent_block);
655 QEMUFile *f = pss->pss_channel;
56e93d26 656
10661f11 657 if (same_block) {
24795694
JQ
658 offset |= RAM_SAVE_FLAG_CONTINUE;
659 }
2bf3aa85 660 qemu_put_be64(f, offset);
56e93d26
JQ
661 size = 8;
662
10661f11 663 if (!same_block) {
9f5f380b 664 len = strlen(block->idstr);
2bf3aa85
JQ
665 qemu_put_byte(f, len);
666 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 667 size += 1 + len;
ec6f3ab9 668 pss->last_sent_block = block;
56e93d26
JQ
669 }
670 return size;
671}
672
3d0684b2 673/**
179a8080 674 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
675 *
676 * Reduce amount of guest cpu execution to hopefully slow down memory
677 * writes. If guest dirty memory rate is reduced below the rate at
678 * which we can transfer pages to the destination then we should be
679 * able to complete migration. Some workloads dirty memory way too
680 * fast and will not effectively converge, even with auto-converge.
070afca2 681 */
cbbf8182
KZ
682static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
683 uint64_t bytes_dirty_threshold)
070afca2
JH
684{
685 MigrationState *s = migrate_get_current();
2594f56d 686 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
687 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
688 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 689 int pct_max = s->parameters.max_cpu_throttle;
070afca2 690
cbbf8182
KZ
691 uint64_t throttle_now = cpu_throttle_get_percentage();
692 uint64_t cpu_now, cpu_ideal, throttle_inc;
693
070afca2
JH
694 /* We have not started throttling yet. Let's start it. */
695 if (!cpu_throttle_active()) {
696 cpu_throttle_set(pct_initial);
697 } else {
698 /* Throttling already on, just increase the rate */
cbbf8182
KZ
699 if (!pct_tailslow) {
700 throttle_inc = pct_increment;
701 } else {
702 /* Compute the ideal CPU percentage used by Guest, which may
703 * make the dirty rate match the dirty rate threshold. */
704 cpu_now = 100 - throttle_now;
705 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
706 bytes_dirty_period);
707 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
708 }
709 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
710 }
711}
712
91fe9a8d
RL
713void mig_throttle_counter_reset(void)
714{
715 RAMState *rs = ram_state;
716
717 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
718 rs->num_dirty_pages_period = 0;
23b7576d 719 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
91fe9a8d
RL
720}
721
3d0684b2
JQ
722/**
723 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
724 *
6f37bb8b 725 * @rs: current RAM state
3d0684b2
JQ
726 * @current_addr: address for the zero page
727 *
728 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
729 * The important thing is that a stale (not-yet-0'd) page be replaced
730 * by the new data.
731 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 732 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 733 */
6f37bb8b 734static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 735{
56e93d26
JQ
736 /* We don't care if this fails to allocate a new cache page
737 * as long as it updated an old one */
c00e0928 738 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 739 ram_counters.dirty_sync_count);
56e93d26
JQ
740}
741
742#define ENCODING_FLAG_XBZRLE 0x1
743
744/**
745 * save_xbzrle_page: compress and send current page
746 *
747 * Returns: 1 means that we wrote the page
748 * 0 means that page is identical to the one already sent
749 * -1 means that xbzrle would be longer than normal
750 *
5a987738 751 * @rs: current RAM state
ec6f3ab9 752 * @pss: current PSS channel
3d0684b2
JQ
753 * @current_data: pointer to the address of the page contents
754 * @current_addr: addr of the page
56e93d26
JQ
755 * @block: block that contains the page we want to send
756 * @offset: offset inside the block for the page
56e93d26 757 */
ec6f3ab9 758static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
61717ea9
PX
759 uint8_t **current_data, ram_addr_t current_addr,
760 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
761{
762 int encoded_len = 0, bytes_xbzrle;
763 uint8_t *prev_cached_page;
ec6f3ab9 764 QEMUFile *file = pss->pss_channel;
56e93d26 765
9360447d
JQ
766 if (!cache_is_cached(XBZRLE.cache, current_addr,
767 ram_counters.dirty_sync_count)) {
768 xbzrle_counters.cache_miss++;
05931ec5 769 if (!rs->last_stage) {
56e93d26 770 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 771 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
772 return -1;
773 } else {
774 /* update *current_data when the page has been
775 inserted into cache */
776 *current_data = get_cached_data(XBZRLE.cache, current_addr);
777 }
778 }
779 return -1;
780 }
781
e460a4b1
WW
782 /*
783 * Reaching here means the page has hit the xbzrle cache, no matter what
784 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 785 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
786 *
787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788 * 2nd page turns out to be skipped (i.e. no new bytes written to the
789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790 * skipped page included. In this way, the encoding rate can tell if the
791 * guest page is good for xbzrle encoding.
792 */
793 xbzrle_counters.pages++;
56e93d26
JQ
794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
795
796 /* save current buffer into memory */
797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
798
799 /* XBZRLE encoding (if there is no overflow) */
800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802 TARGET_PAGE_SIZE);
ca353803
WY
803
804 /*
805 * Update the cache contents, so that it corresponds to the data
806 * sent, in all cases except where we skip the page.
807 */
05931ec5 808 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
810 /*
811 * In the case where we couldn't compress, ensure that the caller
812 * sends the data from the cache, since the guest might have
813 * changed the RAM since we copied it.
814 */
815 *current_data = prev_cached_page;
816 }
817
56e93d26 818 if (encoded_len == 0) {
55c4446b 819 trace_save_xbzrle_page_skipping();
56e93d26
JQ
820 return 0;
821 } else if (encoded_len == -1) {
55c4446b 822 trace_save_xbzrle_page_overflow();
9360447d 823 xbzrle_counters.overflow++;
e460a4b1 824 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
825 return -1;
826 }
827
56e93d26 828 /* Send XBZRLE based compressed page */
ec6f3ab9 829 bytes_xbzrle = save_page_header(pss, block,
204b88b8 830 offset | RAM_SAVE_FLAG_XBZRLE);
61717ea9
PX
831 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
832 qemu_put_be16(file, encoded_len);
833 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
56e93d26 834 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
835 /*
836 * Like compressed_size (please see update_compress_thread_counts),
837 * the xbzrle encoded bytes don't count the 8 byte header with
838 * RAM_SAVE_FLAG_CONTINUE.
839 */
840 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 841 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
842
843 return 1;
844}
845
3d0684b2 846/**
d9e474ea 847 * pss_find_next_dirty: find the next dirty page of current ramblock
f3f491fc 848 *
d9e474ea
PX
849 * This function updates pss->page to point to the next dirty page index
850 * within the ramblock to migrate, or the end of ramblock when nothing
851 * found. Note that when pss->host_page_sending==true it means we're
852 * during sending a host page, so we won't look for dirty page that is
853 * outside the host page boundary.
3d0684b2 854 *
d9e474ea 855 * @pss: the current page search status
f3f491fc 856 */
d9e474ea 857static void pss_find_next_dirty(PageSearchStatus *pss)
56e93d26 858{
d9e474ea 859 RAMBlock *rb = pss->block;
6b6712ef
JQ
860 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
861 unsigned long *bitmap = rb->bmap;
56e93d26 862
fbd162e6 863 if (ramblock_is_ignored(rb)) {
d9e474ea
PX
864 /* Points directly to the end, so we know no dirty page */
865 pss->page = size;
866 return;
867 }
868
869 /*
870 * If during sending a host page, only look for dirty pages within the
871 * current host page being send.
872 */
873 if (pss->host_page_sending) {
874 assert(pss->host_page_end);
875 size = MIN(size, pss->host_page_end);
b895de50
CLG
876 }
877
d9e474ea 878 pss->page = find_next_bit(bitmap, size, pss->page);
56e93d26
JQ
879}
880
1230a25f 881static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
882 unsigned long page)
883{
884 uint8_t shift;
885 hwaddr size, start;
886
887 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
888 return;
889 }
890
891 shift = rb->clear_bmap_shift;
892 /*
893 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
894 * can make things easier sometimes since then start address
895 * of the small chunk will always be 64 pages aligned so the
896 * bitmap will always be aligned to unsigned long. We should
897 * even be able to remove this restriction but I'm simply
898 * keeping it.
899 */
900 assert(shift >= 6);
901
902 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 903 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
904 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
905 memory_region_clear_dirty_bitmap(rb->mr, start, size);
906}
907
908static void
1230a25f 909migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
910 unsigned long start,
911 unsigned long npages)
912{
913 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
914 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
915 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
916
917 /*
918 * Clear pages from start to start + npages - 1, so the end boundary is
919 * exclusive.
920 */
921 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 922 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
923 }
924}
925
a6a83cef
RL
926/*
927 * colo_bitmap_find_diry:find contiguous dirty pages from start
928 *
929 * Returns the page offset within memory region of the start of the contiguout
930 * dirty page
931 *
932 * @rs: current RAM state
933 * @rb: RAMBlock where to search for dirty pages
934 * @start: page where we start the search
935 * @num: the number of contiguous dirty pages
936 */
937static inline
938unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
939 unsigned long start, unsigned long *num)
940{
941 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
942 unsigned long *bitmap = rb->bmap;
943 unsigned long first, next;
944
945 *num = 0;
946
947 if (ramblock_is_ignored(rb)) {
948 return size;
949 }
950
951 first = find_next_bit(bitmap, size, start);
952 if (first >= size) {
953 return first;
954 }
955 next = find_next_zero_bit(bitmap, size, first + 1);
956 assert(next >= first);
957 *num = next - first;
958 return first;
959}
960
06b10688 961static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
962 RAMBlock *rb,
963 unsigned long page)
a82d593b
DDAG
964{
965 bool ret;
a82d593b 966
002cad6b
PX
967 /*
968 * Clear dirty bitmap if needed. This _must_ be called before we
969 * send any of the page in the chunk because we need to make sure
970 * we can capture further page content changes when we sync dirty
971 * log the next time. So as long as we are going to send any of
972 * the page in the chunk we clear the remote dirty bitmap for all.
973 * Clearing it earlier won't be a problem, but too late will.
974 */
1230a25f 975 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 976
6b6712ef 977 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 978 if (ret) {
0d8ec885 979 rs->migration_dirty_pages--;
a82d593b 980 }
386a907b 981
a82d593b
DDAG
982 return ret;
983}
984
be39b4cd
DH
985static void dirty_bitmap_clear_section(MemoryRegionSection *section,
986 void *opaque)
987{
988 const hwaddr offset = section->offset_within_region;
989 const hwaddr size = int128_get64(section->size);
990 const unsigned long start = offset >> TARGET_PAGE_BITS;
991 const unsigned long npages = size >> TARGET_PAGE_BITS;
992 RAMBlock *rb = section->mr->ram_block;
993 uint64_t *cleared_bits = opaque;
994
995 /*
996 * We don't grab ram_state->bitmap_mutex because we expect to run
997 * only when starting migration or during postcopy recovery where
998 * we don't have concurrent access.
999 */
1000 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1001 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1002 }
1003 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1004 bitmap_clear(rb->bmap, start, npages);
1005}
1006
1007/*
1008 * Exclude all dirty pages from migration that fall into a discarded range as
1009 * managed by a RamDiscardManager responsible for the mapped memory region of
1010 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1011 *
1012 * Discarded pages ("logically unplugged") have undefined content and must
1013 * not get migrated, because even reading these pages for migration might
1014 * result in undesired behavior.
1015 *
1016 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1017 *
1018 * Note: The result is only stable while migrating (precopy/postcopy).
1019 */
1020static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1021{
1022 uint64_t cleared_bits = 0;
1023
1024 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1025 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1026 MemoryRegionSection section = {
1027 .mr = rb->mr,
1028 .offset_within_region = 0,
1029 .size = int128_make64(qemu_ram_get_used_length(rb)),
1030 };
1031
1032 ram_discard_manager_replay_discarded(rdm, &section,
1033 dirty_bitmap_clear_section,
1034 &cleared_bits);
1035 }
1036 return cleared_bits;
1037}
1038
9470c5e0
DH
1039/*
1040 * Check if a host-page aligned page falls into a discarded range as managed by
1041 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1042 *
1043 * Note: The result is only stable while migrating (precopy/postcopy).
1044 */
1045bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1046{
1047 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1048 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1049 MemoryRegionSection section = {
1050 .mr = rb->mr,
1051 .offset_within_region = start,
1052 .size = int128_make64(qemu_ram_pagesize(rb)),
1053 };
1054
1055 return !ram_discard_manager_is_populated(rdm, &section);
1056 }
1057 return false;
1058}
1059
267691b6 1060/* Called with RCU critical section */
7a3e9571 1061static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1062{
fb613580
KZ
1063 uint64_t new_dirty_pages =
1064 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1065
1066 rs->migration_dirty_pages += new_dirty_pages;
1067 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1068}
1069
3d0684b2
JQ
1070/**
1071 * ram_pagesize_summary: calculate all the pagesizes of a VM
1072 *
1073 * Returns a summary bitmap of the page sizes of all RAMBlocks
1074 *
1075 * For VMs with just normal pages this is equivalent to the host page
1076 * size. If it's got some huge pages then it's the OR of all the
1077 * different page sizes.
e8ca1db2
DDAG
1078 */
1079uint64_t ram_pagesize_summary(void)
1080{
1081 RAMBlock *block;
1082 uint64_t summary = 0;
1083
fbd162e6 1084 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1085 summary |= block->page_size;
1086 }
1087
1088 return summary;
1089}
1090
aecbfe9c
XG
1091uint64_t ram_get_total_transferred_pages(void)
1092{
23b7576d
PX
1093 return stat64_get(&ram_atomic_counters.normal) +
1094 stat64_get(&ram_atomic_counters.duplicate) +
1095 compression_counters.pages + xbzrle_counters.pages;
aecbfe9c
XG
1096}
1097
b734035b
XG
1098static void migration_update_rates(RAMState *rs, int64_t end_time)
1099{
be8b02ed 1100 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1101 double compressed_size;
b734035b
XG
1102
1103 /* calculate period counters */
1104 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1105 / (end_time - rs->time_last_bitmap_sync);
1106
be8b02ed 1107 if (!page_count) {
b734035b
XG
1108 return;
1109 }
1110
1111 if (migrate_use_xbzrle()) {
e460a4b1
WW
1112 double encoded_size, unencoded_size;
1113
b734035b 1114 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1115 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1116 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1117 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1118 TARGET_PAGE_SIZE;
1119 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1120 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1121 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1122 } else {
1123 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1124 }
1125 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1126 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1127 }
76e03000
XG
1128
1129 if (migrate_use_compression()) {
1130 compression_counters.busy_rate = (double)(compression_counters.busy -
1131 rs->compress_thread_busy_prev) / page_count;
1132 rs->compress_thread_busy_prev = compression_counters.busy;
1133
1134 compressed_size = compression_counters.compressed_size -
1135 rs->compressed_size_prev;
1136 if (compressed_size) {
1137 double uncompressed_size = (compression_counters.pages -
1138 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1139
1140 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1141 compression_counters.compression_rate =
1142 uncompressed_size / compressed_size;
1143
1144 rs->compress_pages_prev = compression_counters.pages;
1145 rs->compressed_size_prev = compression_counters.compressed_size;
1146 }
1147 }
b734035b
XG
1148}
1149
dc14a470
KZ
1150static void migration_trigger_throttle(RAMState *rs)
1151{
1152 MigrationState *s = migrate_get_current();
1153 uint64_t threshold = s->parameters.throttle_trigger_threshold;
23b7576d
PX
1154 uint64_t bytes_xfer_period =
1155 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
dc14a470
KZ
1156 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1157 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1158
1159 /* During block migration the auto-converge logic incorrectly detects
1160 * that ram migration makes no progress. Avoid this by disabling the
1161 * throttling logic during the bulk phase of block migration. */
1162 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1163 /* The following detection logic can be refined later. For now:
1164 Check to see if the ratio between dirtied bytes and the approx.
1165 amount of bytes that just got transferred since the last time
1166 we were in this routine reaches the threshold. If that happens
1167 twice, start or increase throttling. */
1168
1169 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1170 (++rs->dirty_rate_high_cnt >= 2)) {
1171 trace_migration_throttle();
1172 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1173 mig_throttle_guest_down(bytes_dirty_period,
1174 bytes_dirty_threshold);
dc14a470
KZ
1175 }
1176 }
1177}
1178
8d820d6f 1179static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1180{
1181 RAMBlock *block;
56e93d26 1182 int64_t end_time;
56e93d26 1183
9360447d 1184 ram_counters.dirty_sync_count++;
56e93d26 1185
f664da80
JQ
1186 if (!rs->time_last_bitmap_sync) {
1187 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1188 }
1189
1190 trace_migration_bitmap_sync_start();
9c1f8f44 1191 memory_global_dirty_log_sync();
56e93d26 1192
108cfae0 1193 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1194 WITH_RCU_READ_LOCK_GUARD() {
1195 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1196 ramblock_sync_dirty_bitmap(rs, block);
1197 }
1198 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1199 }
108cfae0 1200 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1201
9458a9a1 1202 memory_global_after_dirty_log_sync();
a66cd90c 1203 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1204
56e93d26
JQ
1205 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1206
1207 /* more than 1 second = 1000 millisecons */
f664da80 1208 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1209 migration_trigger_throttle(rs);
070afca2 1210
b734035b
XG
1211 migration_update_rates(rs, end_time);
1212
be8b02ed 1213 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1214
1215 /* reset period counters */
f664da80 1216 rs->time_last_bitmap_sync = end_time;
a66cd90c 1217 rs->num_dirty_pages_period = 0;
23b7576d 1218 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
56e93d26 1219 }
4addcd4f 1220 if (migrate_use_events()) {
3ab72385 1221 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1222 }
56e93d26
JQ
1223}
1224
bd227060
WW
1225static void migration_bitmap_sync_precopy(RAMState *rs)
1226{
1227 Error *local_err = NULL;
1228
1229 /*
1230 * The current notifier usage is just an optimization to migration, so we
1231 * don't stop the normal migration process in the error case.
1232 */
1233 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1234 error_report_err(local_err);
b4a1733c 1235 local_err = NULL;
bd227060
WW
1236 }
1237
1238 migration_bitmap_sync(rs);
1239
1240 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1241 error_report_err(local_err);
1242 }
1243}
1244
a4dbaf8e 1245void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1246{
1247 if (!migrate_release_ram() || !migration_in_postcopy()) {
1248 return;
1249 }
1250
1251 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1252}
1253
6c97ec5f
XG
1254/**
1255 * save_zero_page_to_file: send the zero page to the file
1256 *
1257 * Returns the size of data written to the file, 0 means the page is not
1258 * a zero page
1259 *
ec6f3ab9 1260 * @pss: current PSS channel
6c97ec5f
XG
1261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1263 */
ec6f3ab9 1264static int save_zero_page_to_file(PageSearchStatus *pss,
6c97ec5f
XG
1265 RAMBlock *block, ram_addr_t offset)
1266{
1267 uint8_t *p = block->host + offset;
ec6f3ab9 1268 QEMUFile *file = pss->pss_channel;
6c97ec5f
XG
1269 int len = 0;
1270
bad452a7 1271 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
ec6f3ab9 1272 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
6c97ec5f
XG
1273 qemu_put_byte(file, 0);
1274 len += 1;
47fe16ff 1275 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1276 }
1277 return len;
1278}
1279
56e93d26 1280/**
3d0684b2 1281 * save_zero_page: send the zero page to the stream
56e93d26 1282 *
3d0684b2 1283 * Returns the number of pages written.
56e93d26 1284 *
ec6f3ab9 1285 * @pss: current PSS channel
56e93d26
JQ
1286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
56e93d26 1288 */
ec6f3ab9 1289static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1290 ram_addr_t offset)
56e93d26 1291{
ec6f3ab9 1292 int len = save_zero_page_to_file(pss, block, offset);
56e93d26 1293
6c97ec5f 1294 if (len) {
23b7576d 1295 stat64_add(&ram_atomic_counters.duplicate, 1);
4c2d0f6d 1296 ram_transferred_add(len);
6c97ec5f 1297 return 1;
56e93d26 1298 }
6c97ec5f 1299 return -1;
56e93d26
JQ
1300}
1301
059ff0fb
XG
1302/*
1303 * @pages: the number of pages written by the control path,
1304 * < 0 - error
1305 * > 0 - number of pages written
1306 *
1307 * Return true if the pages has been saved, otherwise false is returned.
1308 */
61717ea9
PX
1309static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1310 ram_addr_t offset, int *pages)
059ff0fb
XG
1311{
1312 uint64_t bytes_xmit = 0;
1313 int ret;
1314
1315 *pages = -1;
61717ea9
PX
1316 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1317 TARGET_PAGE_SIZE, &bytes_xmit);
059ff0fb
XG
1318 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1319 return false;
1320 }
1321
1322 if (bytes_xmit) {
4c2d0f6d 1323 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1324 *pages = 1;
1325 }
1326
1327 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1328 return true;
1329 }
1330
1331 if (bytes_xmit > 0) {
23b7576d 1332 stat64_add(&ram_atomic_counters.normal, 1);
059ff0fb 1333 } else if (bytes_xmit == 0) {
23b7576d 1334 stat64_add(&ram_atomic_counters.duplicate, 1);
059ff0fb
XG
1335 }
1336
1337 return true;
1338}
1339
65dacaa0
XG
1340/*
1341 * directly send the page to the stream
1342 *
1343 * Returns the number of pages written.
1344 *
ec6f3ab9 1345 * @pss: current PSS channel
65dacaa0
XG
1346 * @block: block that contains the page we want to send
1347 * @offset: offset inside the block for the page
1348 * @buf: the page to be sent
1349 * @async: send to page asyncly
1350 */
ec6f3ab9 1351static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1352 ram_addr_t offset, uint8_t *buf, bool async)
65dacaa0 1353{
ec6f3ab9
PX
1354 QEMUFile *file = pss->pss_channel;
1355
1356 ram_transferred_add(save_page_header(pss, block,
4c2d0f6d 1357 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0 1358 if (async) {
61717ea9 1359 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
f912ec5b 1360 migrate_release_ram() &&
65dacaa0
XG
1361 migration_in_postcopy());
1362 } else {
61717ea9 1363 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
65dacaa0 1364 }
4c2d0f6d 1365 ram_transferred_add(TARGET_PAGE_SIZE);
23b7576d 1366 stat64_add(&ram_atomic_counters.normal, 1);
65dacaa0
XG
1367 return 1;
1368}
1369
56e93d26 1370/**
3d0684b2 1371 * ram_save_page: send the given page to the stream
56e93d26 1372 *
3d0684b2 1373 * Returns the number of pages written.
3fd3c4b3
DDAG
1374 * < 0 - error
1375 * >=0 - Number of pages written - this might legally be 0
1376 * if xbzrle noticed the page was the same.
56e93d26 1377 *
6f37bb8b 1378 * @rs: current RAM state
56e93d26
JQ
1379 * @block: block that contains the page we want to send
1380 * @offset: offset inside the block for the page
56e93d26 1381 */
05931ec5 1382static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1383{
1384 int pages = -1;
56e93d26 1385 uint8_t *p;
56e93d26 1386 bool send_async = true;
a08f6890 1387 RAMBlock *block = pss->block;
8bba004c 1388 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1389 ram_addr_t current_addr = block->offset + offset;
56e93d26 1390
2f68e399 1391 p = block->host + offset;
1db9d8e5 1392 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1393
56e93d26 1394 XBZRLE_cache_lock();
1a373522 1395 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
ec6f3ab9 1396 pages = save_xbzrle_page(rs, pss, &p, current_addr,
61717ea9 1397 block, offset);
05931ec5 1398 if (!rs->last_stage) {
059ff0fb
XG
1399 /* Can't send this cached data async, since the cache page
1400 * might get updated before it gets to the wire
56e93d26 1401 */
059ff0fb 1402 send_async = false;
56e93d26
JQ
1403 }
1404 }
1405
1406 /* XBZRLE overflow or normal page */
1407 if (pages == -1) {
ec6f3ab9 1408 pages = save_normal_page(pss, block, offset, p, send_async);
56e93d26
JQ
1409 }
1410
1411 XBZRLE_cache_unlock();
1412
1413 return pages;
1414}
1415
61717ea9 1416static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
b9ee2f7d
JQ
1417 ram_addr_t offset)
1418{
61717ea9 1419 if (multifd_queue_page(file, block, offset) < 0) {
713f762a
IR
1420 return -1;
1421 }
23b7576d 1422 stat64_add(&ram_atomic_counters.normal, 1);
b9ee2f7d
JQ
1423
1424 return 1;
1425}
1426
5e5fdcff 1427static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1428 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1429{
53518d94 1430 RAMState *rs = ram_state;
ec6f3ab9 1431 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
20d549cb 1432 uint8_t *p = block->host + offset;
6ef3771c 1433 int ret;
56e93d26 1434
ec6f3ab9 1435 if (save_zero_page_to_file(pss, block, offset)) {
e7f2e190 1436 return true;
5e5fdcff
XG
1437 }
1438
ec6f3ab9 1439 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1440
1441 /*
1442 * copy it to a internal buffer to avoid it being modified by VM
1443 * so that we can catch up the error during compression and
1444 * decompression
1445 */
1446 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1447 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1448 if (ret < 0) {
1449 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1450 error_report("compressed data failed!");
b3be2896 1451 }
e7f2e190 1452 return false;
5e5fdcff
XG
1453}
1454
1455static void
1456update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1457{
4c2d0f6d 1458 ram_transferred_add(bytes_xmit);
76e03000 1459
5e5fdcff 1460 if (param->zero_page) {
23b7576d 1461 stat64_add(&ram_atomic_counters.duplicate, 1);
76e03000 1462 return;
5e5fdcff 1463 }
76e03000
XG
1464
1465 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1466 compression_counters.compressed_size += bytes_xmit - 8;
1467 compression_counters.pages++;
56e93d26
JQ
1468}
1469
32b05495
XG
1470static bool save_page_use_compression(RAMState *rs);
1471
ce25d337 1472static void flush_compressed_data(RAMState *rs)
56e93d26 1473{
eaa238ab 1474 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1475 int idx, len, thread_count;
1476
32b05495 1477 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1478 return;
1479 }
1480 thread_count = migrate_compress_threads();
a7a9a88f 1481
0d9f9a5c 1482 qemu_mutex_lock(&comp_done_lock);
56e93d26 1483 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1484 while (!comp_param[idx].done) {
0d9f9a5c 1485 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1486 }
a7a9a88f 1487 }
0d9f9a5c 1488 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1489
1490 for (idx = 0; idx < thread_count; idx++) {
1491 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1492 if (!comp_param[idx].quit) {
eaa238ab 1493 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
5e5fdcff
XG
1494 /*
1495 * it's safe to fetch zero_page without holding comp_done_lock
1496 * as there is no further request submitted to the thread,
1497 * i.e, the thread should be waiting for a request at this point.
1498 */
1499 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1500 }
a7a9a88f 1501 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1502 }
1503}
1504
1505static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1506 ram_addr_t offset)
1507{
1508 param->block = block;
1509 param->offset = offset;
1510}
1511
eaa238ab 1512static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
1513{
1514 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1515 bool wait = migrate_compress_wait_thread();
eaa238ab 1516 MigrationState *ms = migrate_get_current();
56e93d26
JQ
1517
1518 thread_count = migrate_compress_threads();
0d9f9a5c 1519 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1520retry:
1521 for (idx = 0; idx < thread_count; idx++) {
1522 if (comp_param[idx].done) {
1523 comp_param[idx].done = false;
eaa238ab
PX
1524 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1525 comp_param[idx].file);
1d58872a
XG
1526 qemu_mutex_lock(&comp_param[idx].mutex);
1527 set_compress_params(&comp_param[idx], block, offset);
1528 qemu_cond_signal(&comp_param[idx].cond);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1530 pages = 1;
5e5fdcff 1531 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1532 break;
56e93d26
JQ
1533 }
1534 }
1d58872a
XG
1535
1536 /*
1537 * wait for the free thread if the user specifies 'compress-wait-thread',
1538 * otherwise we will post the page out in the main thread as normal page.
1539 */
1540 if (pages < 0 && wait) {
1541 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 goto retry;
1543 }
0d9f9a5c 1544 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1545
1546 return pages;
1547}
1548
3d0684b2
JQ
1549/**
1550 * find_dirty_block: find the next dirty page and update any state
1551 * associated with the search process.
b9e60928 1552 *
a5f7b1a6 1553 * Returns true if a page is found
b9e60928 1554 *
6f37bb8b 1555 * @rs: current RAM state
3d0684b2
JQ
1556 * @pss: data about the state of the current dirty page scan
1557 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1558 */
f20e2865 1559static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1560{
d9e474ea
PX
1561 /* Update pss->page for the next dirty bit in ramblock */
1562 pss_find_next_dirty(pss);
1563
6f37bb8b 1564 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1565 pss->page >= rs->last_page) {
b9e60928
DDAG
1566 /*
1567 * We've been once around the RAM and haven't found anything.
1568 * Give up.
1569 */
1570 *again = false;
1571 return false;
1572 }
542147f4
DH
1573 if (!offset_in_ramblock(pss->block,
1574 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1575 /* Didn't find anything in this RAM Block */
a935e30f 1576 pss->page = 0;
b9e60928
DDAG
1577 pss->block = QLIST_NEXT_RCU(pss->block, next);
1578 if (!pss->block) {
48df9d80
XG
1579 /*
1580 * If memory migration starts over, we will meet a dirtied page
1581 * which may still exists in compression threads's ring, so we
1582 * should flush the compressed data to make sure the new page
1583 * is not overwritten by the old one in the destination.
1584 *
1585 * Also If xbzrle is on, stop using the data compression at this
1586 * point. In theory, xbzrle can do better than compression.
1587 */
1588 flush_compressed_data(rs);
1589
b9e60928
DDAG
1590 /* Hit the end of the list */
1591 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1592 /* Flag that we've looped */
1593 pss->complete_round = true;
1a373522
DH
1594 /* After the first round, enable XBZRLE. */
1595 if (migrate_use_xbzrle()) {
1596 rs->xbzrle_enabled = true;
1597 }
b9e60928
DDAG
1598 }
1599 /* Didn't find anything this time, but try again on the new block */
1600 *again = true;
1601 return false;
1602 } else {
1603 /* Can go around again, but... */
1604 *again = true;
1605 /* We've found something so probably don't need to */
1606 return true;
1607 }
1608}
1609
3d0684b2
JQ
1610/**
1611 * unqueue_page: gets a page of the queue
1612 *
a82d593b 1613 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1614 *
3d0684b2
JQ
1615 * Returns the block of the page (or NULL if none available)
1616 *
ec481c6c 1617 * @rs: current RAM state
3d0684b2 1618 * @offset: used to return the offset within the RAMBlock
a82d593b 1619 */
f20e2865 1620static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1621{
a1fe28df 1622 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1623 RAMBlock *block = NULL;
1624
a1fe28df 1625 if (!postcopy_has_request(rs)) {
ae526e32
XG
1626 return NULL;
1627 }
1628
6e8a355d 1629 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1630
1631 /*
1632 * This should _never_ change even after we take the lock, because no one
1633 * should be taking anything off the request list other than us.
1634 */
1635 assert(postcopy_has_request(rs));
1636
1637 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1638 block = entry->rb;
1639 *offset = entry->offset;
1640
777f53c7
TH
1641 if (entry->len > TARGET_PAGE_SIZE) {
1642 entry->len -= TARGET_PAGE_SIZE;
1643 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1644 } else {
1645 memory_region_unref(block->mr);
1646 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1647 g_free(entry);
1648 migration_consume_urgent_request();
a82d593b 1649 }
a82d593b
DDAG
1650
1651 return block;
1652}
1653
278e2f55
AG
1654#if defined(__linux__)
1655/**
1656 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1657 * is found, return RAM block pointer and page offset
1658 *
1659 * Returns pointer to the RAMBlock containing faulting page,
1660 * NULL if no write faults are pending
1661 *
1662 * @rs: current RAM state
1663 * @offset: page offset from the beginning of the block
1664 */
1665static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1666{
1667 struct uffd_msg uffd_msg;
1668 void *page_address;
82ea3e3b 1669 RAMBlock *block;
278e2f55
AG
1670 int res;
1671
1672 if (!migrate_background_snapshot()) {
1673 return NULL;
1674 }
1675
1676 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1677 if (res <= 0) {
1678 return NULL;
1679 }
1680
1681 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1682 block = qemu_ram_block_from_host(page_address, false, offset);
1683 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1684 return block;
278e2f55
AG
1685}
1686
1687/**
1688 * ram_save_release_protection: release UFFD write protection after
1689 * a range of pages has been saved
1690 *
1691 * @rs: current RAM state
1692 * @pss: page-search-status structure
1693 * @start_page: index of the first page in the range relative to pss->block
1694 *
1695 * Returns 0 on success, negative value in case of an error
1696*/
1697static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1698 unsigned long start_page)
1699{
1700 int res = 0;
1701
1702 /* Check if page is from UFFD-managed region. */
1703 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1704 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1705 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1706
1707 /* Flush async buffers before un-protect. */
61717ea9 1708 qemu_fflush(pss->pss_channel);
278e2f55
AG
1709 /* Un-protect memory range. */
1710 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1711 false, false);
1712 }
1713
1714 return res;
1715}
1716
1717/* ram_write_tracking_available: check if kernel supports required UFFD features
1718 *
1719 * Returns true if supports, false otherwise
1720 */
1721bool ram_write_tracking_available(void)
1722{
1723 uint64_t uffd_features;
1724 int res;
1725
1726 res = uffd_query_features(&uffd_features);
1727 return (res == 0 &&
1728 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1729}
1730
1731/* ram_write_tracking_compatible: check if guest configuration is
1732 * compatible with 'write-tracking'
1733 *
1734 * Returns true if compatible, false otherwise
1735 */
1736bool ram_write_tracking_compatible(void)
1737{
1738 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1739 int uffd_fd;
82ea3e3b 1740 RAMBlock *block;
278e2f55
AG
1741 bool ret = false;
1742
1743 /* Open UFFD file descriptor */
1744 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1745 if (uffd_fd < 0) {
1746 return false;
1747 }
1748
1749 RCU_READ_LOCK_GUARD();
1750
82ea3e3b 1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1752 uint64_t uffd_ioctls;
1753
1754 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1755 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1756 continue;
1757 }
1758 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1759 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1760 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1761 goto out;
1762 }
1763 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1764 goto out;
1765 }
1766 }
1767 ret = true;
1768
1769out:
1770 uffd_close_fd(uffd_fd);
1771 return ret;
1772}
1773
f7b9dcfb
DH
1774static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1775 ram_addr_t size)
1776{
5f19a449
DH
1777 const ram_addr_t end = offset + size;
1778
f7b9dcfb
DH
1779 /*
1780 * We read one byte of each page; this will preallocate page tables if
1781 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1782 * where no page was populated yet. This might require adaption when
1783 * supporting other mappings, like shmem.
1784 */
5f19a449 1785 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1786 char tmp = *((char *)block->host + offset);
1787
1788 /* Don't optimize the read out */
1789 asm volatile("" : "+r" (tmp));
1790 }
1791}
1792
6fee3a1f
DH
1793static inline int populate_read_section(MemoryRegionSection *section,
1794 void *opaque)
1795{
1796 const hwaddr size = int128_get64(section->size);
1797 hwaddr offset = section->offset_within_region;
1798 RAMBlock *block = section->mr->ram_block;
1799
1800 populate_read_range(block, offset, size);
1801 return 0;
1802}
1803
eeccb99c 1804/*
f7b9dcfb
DH
1805 * ram_block_populate_read: preallocate page tables and populate pages in the
1806 * RAM block by reading a byte of each page.
eeccb99c
AG
1807 *
1808 * Since it's solely used for userfault_fd WP feature, here we just
1809 * hardcode page size to qemu_real_host_page_size.
1810 *
82ea3e3b 1811 * @block: RAM block to populate
eeccb99c 1812 */
6fee3a1f 1813static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1814{
6fee3a1f
DH
1815 /*
1816 * Skip populating all pages that fall into a discarded range as managed by
1817 * a RamDiscardManager responsible for the mapped memory region of the
1818 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1819 * must not get populated automatically. We don't have to track
1820 * modifications via userfaultfd WP reliably, because these pages will
1821 * not be part of the migration stream either way -- see
1822 * ramblock_dirty_bitmap_exclude_discarded_pages().
1823 *
1824 * Note: The result is only stable while migrating (precopy/postcopy).
1825 */
1826 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1827 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1828 MemoryRegionSection section = {
1829 .mr = rb->mr,
1830 .offset_within_region = 0,
1831 .size = rb->mr->size,
1832 };
1833
1834 ram_discard_manager_replay_populated(rdm, &section,
1835 populate_read_section, NULL);
1836 } else {
1837 populate_read_range(rb, 0, rb->used_length);
1838 }
eeccb99c
AG
1839}
1840
1841/*
1842 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1843 */
1844void ram_write_tracking_prepare(void)
1845{
82ea3e3b 1846 RAMBlock *block;
eeccb99c
AG
1847
1848 RCU_READ_LOCK_GUARD();
1849
82ea3e3b 1850 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1851 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1852 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1853 continue;
1854 }
1855
1856 /*
1857 * Populate pages of the RAM block before enabling userfault_fd
1858 * write protection.
1859 *
1860 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1861 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1862 * pages with pte_none() entries in page table.
1863 */
f7b9dcfb 1864 ram_block_populate_read(block);
eeccb99c
AG
1865 }
1866}
1867
e41c5770
DH
1868static inline int uffd_protect_section(MemoryRegionSection *section,
1869 void *opaque)
1870{
1871 const hwaddr size = int128_get64(section->size);
1872 const hwaddr offset = section->offset_within_region;
1873 RAMBlock *rb = section->mr->ram_block;
1874 int uffd_fd = (uintptr_t)opaque;
1875
1876 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1877 false);
1878}
1879
1880static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1881{
1882 assert(rb->flags & RAM_UF_WRITEPROTECT);
1883
1884 /* See ram_block_populate_read() */
1885 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1886 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1887 MemoryRegionSection section = {
1888 .mr = rb->mr,
1889 .offset_within_region = 0,
1890 .size = rb->mr->size,
1891 };
1892
1893 return ram_discard_manager_replay_populated(rdm, &section,
1894 uffd_protect_section,
1895 (void *)(uintptr_t)uffd_fd);
1896 }
1897 return uffd_change_protection(uffd_fd, rb->host,
1898 rb->used_length, true, false);
1899}
1900
278e2f55
AG
1901/*
1902 * ram_write_tracking_start: start UFFD-WP memory tracking
1903 *
1904 * Returns 0 for success or negative value in case of error
1905 */
1906int ram_write_tracking_start(void)
1907{
1908 int uffd_fd;
1909 RAMState *rs = ram_state;
82ea3e3b 1910 RAMBlock *block;
278e2f55
AG
1911
1912 /* Open UFFD file descriptor */
1913 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1914 if (uffd_fd < 0) {
1915 return uffd_fd;
1916 }
1917 rs->uffdio_fd = uffd_fd;
1918
1919 RCU_READ_LOCK_GUARD();
1920
82ea3e3b 1921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1922 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1923 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1924 continue;
1925 }
1926
1927 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1928 if (uffd_register_memory(rs->uffdio_fd, block->host,
1929 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1930 goto fail;
1931 }
72ef3a37
DH
1932 block->flags |= RAM_UF_WRITEPROTECT;
1933 memory_region_ref(block->mr);
1934
278e2f55 1935 /* Apply UFFD write protection to the block memory range */
e41c5770 1936 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
1937 goto fail;
1938 }
278e2f55 1939
82ea3e3b
AG
1940 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1941 block->host, block->max_length);
278e2f55
AG
1942 }
1943
1944 return 0;
1945
1946fail:
1947 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1948
82ea3e3b
AG
1949 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1950 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1951 continue;
1952 }
82ea3e3b 1953 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1954 /* Cleanup flags and remove reference */
82ea3e3b
AG
1955 block->flags &= ~RAM_UF_WRITEPROTECT;
1956 memory_region_unref(block->mr);
278e2f55
AG
1957 }
1958
1959 uffd_close_fd(uffd_fd);
1960 rs->uffdio_fd = -1;
1961 return -1;
1962}
1963
1964/**
1965 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1966 */
1967void ram_write_tracking_stop(void)
1968{
1969 RAMState *rs = ram_state;
82ea3e3b 1970 RAMBlock *block;
278e2f55
AG
1971
1972 RCU_READ_LOCK_GUARD();
1973
82ea3e3b
AG
1974 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1975 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1976 continue;
1977 }
82ea3e3b 1978 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1979
82ea3e3b
AG
1980 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1981 block->host, block->max_length);
278e2f55
AG
1982
1983 /* Cleanup flags and remove reference */
82ea3e3b
AG
1984 block->flags &= ~RAM_UF_WRITEPROTECT;
1985 memory_region_unref(block->mr);
278e2f55
AG
1986 }
1987
1988 /* Finally close UFFD file descriptor */
1989 uffd_close_fd(rs->uffdio_fd);
1990 rs->uffdio_fd = -1;
1991}
1992
1993#else
1994/* No target OS support, stubs just fail or ignore */
1995
1996static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1997{
1998 (void) rs;
1999 (void) offset;
2000
2001 return NULL;
2002}
2003
2004static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2005 unsigned long start_page)
2006{
2007 (void) rs;
2008 (void) pss;
2009 (void) start_page;
2010
2011 return 0;
2012}
2013
2014bool ram_write_tracking_available(void)
2015{
2016 return false;
2017}
2018
2019bool ram_write_tracking_compatible(void)
2020{
2021 assert(0);
2022 return false;
2023}
2024
2025int ram_write_tracking_start(void)
2026{
2027 assert(0);
2028 return -1;
2029}
2030
2031void ram_write_tracking_stop(void)
2032{
2033 assert(0);
2034}
2035#endif /* defined(__linux__) */
2036
3d0684b2 2037/**
ff1543af 2038 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2039 *
2040 * Skips pages that are already sent (!dirty)
a82d593b 2041 *
a5f7b1a6 2042 * Returns true if a queued page is found
a82d593b 2043 *
6f37bb8b 2044 * @rs: current RAM state
3d0684b2 2045 * @pss: data about the state of the current dirty page scan
a82d593b 2046 */
f20e2865 2047static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2048{
2049 RAMBlock *block;
2050 ram_addr_t offset;
777f53c7
TH
2051 bool dirty;
2052
2053 do {
2054 block = unqueue_page(rs, &offset);
2055 /*
2056 * We're sending this page, and since it's postcopy nothing else
2057 * will dirty it, and we must make sure it doesn't get sent again
2058 * even if this queue request was received after the background
2059 * search already sent it.
2060 */
2061 if (block) {
2062 unsigned long page;
2063
2064 page = offset >> TARGET_PAGE_BITS;
2065 dirty = test_bit(page, block->bmap);
2066 if (!dirty) {
2067 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2068 page);
2069 } else {
2070 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2071 }
2072 }
a82d593b 2073
777f53c7 2074 } while (block && !dirty);
a82d593b 2075
b062106d 2076 if (!block) {
278e2f55
AG
2077 /*
2078 * Poll write faults too if background snapshot is enabled; that's
2079 * when we have vcpus got blocked by the write protected pages.
2080 */
2081 block = poll_fault_page(rs, &offset);
2082 }
2083
a82d593b 2084 if (block) {
a82d593b
DDAG
2085 /*
2086 * We want the background search to continue from the queued page
2087 * since the guest is likely to want other pages near to the page
2088 * it just requested.
2089 */
2090 pss->block = block;
a935e30f 2091 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2092
2093 /*
2094 * This unqueued page would break the "one round" check, even is
2095 * really rare.
2096 */
2097 pss->complete_round = false;
a82d593b
DDAG
2098 }
2099
2100 return !!block;
2101}
2102
6c595cde 2103/**
5e58f968
JQ
2104 * migration_page_queue_free: drop any remaining pages in the ram
2105 * request queue
6c595cde 2106 *
3d0684b2
JQ
2107 * It should be empty at the end anyway, but in error cases there may
2108 * be some left. in case that there is any page left, we drop it.
2109 *
6c595cde 2110 */
83c13382 2111static void migration_page_queue_free(RAMState *rs)
6c595cde 2112{
ec481c6c 2113 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2114 /* This queue generally should be empty - but in the case of a failed
2115 * migration might have some droppings in.
2116 */
89ac5a1d 2117 RCU_READ_LOCK_GUARD();
ec481c6c 2118 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2119 memory_region_unref(mspr->rb->mr);
ec481c6c 2120 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2121 g_free(mspr);
2122 }
6c595cde
DDAG
2123}
2124
2125/**
3d0684b2
JQ
2126 * ram_save_queue_pages: queue the page for transmission
2127 *
2128 * A request from postcopy destination for example.
2129 *
2130 * Returns zero on success or negative on error
2131 *
3d0684b2
JQ
2132 * @rbname: Name of the RAMBLock of the request. NULL means the
2133 * same that last one.
2134 * @start: starting address from the start of the RAMBlock
2135 * @len: length (in bytes) to send
6c595cde 2136 */
96506894 2137int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2138{
2139 RAMBlock *ramblock;
53518d94 2140 RAMState *rs = ram_state;
6c595cde 2141
9360447d 2142 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2143 RCU_READ_LOCK_GUARD();
2144
6c595cde
DDAG
2145 if (!rbname) {
2146 /* Reuse last RAMBlock */
68a098f3 2147 ramblock = rs->last_req_rb;
6c595cde
DDAG
2148
2149 if (!ramblock) {
2150 /*
2151 * Shouldn't happen, we can't reuse the last RAMBlock if
2152 * it's the 1st request.
2153 */
2154 error_report("ram_save_queue_pages no previous block");
03acb4e9 2155 return -1;
6c595cde
DDAG
2156 }
2157 } else {
2158 ramblock = qemu_ram_block_by_name(rbname);
2159
2160 if (!ramblock) {
2161 /* We shouldn't be asked for a non-existent RAMBlock */
2162 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2163 return -1;
6c595cde 2164 }
68a098f3 2165 rs->last_req_rb = ramblock;
6c595cde
DDAG
2166 }
2167 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2168 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2169 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2170 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2171 __func__, start, len, ramblock->used_length);
03acb4e9 2172 return -1;
6c595cde
DDAG
2173 }
2174
93589827
PX
2175 /*
2176 * When with postcopy preempt, we send back the page directly in the
2177 * rp-return thread.
2178 */
2179 if (postcopy_preempt_active()) {
2180 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2181 size_t page_size = qemu_ram_pagesize(ramblock);
2182 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2183 int ret = 0;
2184
2185 qemu_mutex_lock(&rs->bitmap_mutex);
2186
2187 pss_init(pss, ramblock, page_start);
2188 /*
2189 * Always use the preempt channel, and make sure it's there. It's
2190 * safe to access without lock, because when rp-thread is running
2191 * we should be the only one who operates on the qemufile
2192 */
2193 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2194 assert(pss->pss_channel);
2195
2196 /*
2197 * It must be either one or multiple of host page size. Just
2198 * assert; if something wrong we're mostly split brain anyway.
2199 */
2200 assert(len % page_size == 0);
2201 while (len) {
2202 if (ram_save_host_page_urgent(pss)) {
2203 error_report("%s: ram_save_host_page_urgent() failed: "
2204 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2205 __func__, ramblock->idstr, start);
2206 ret = -1;
2207 break;
2208 }
2209 /*
2210 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2211 * will automatically be moved and point to the next host page
2212 * we're going to send, so no need to update here.
2213 *
2214 * Normally QEMU never sends >1 host page in requests, so
2215 * logically we don't even need that as the loop should only
2216 * run once, but just to be consistent.
2217 */
2218 len -= page_size;
2219 };
2220 qemu_mutex_unlock(&rs->bitmap_mutex);
2221
2222 return ret;
2223 }
2224
ec481c6c 2225 struct RAMSrcPageRequest *new_entry =
b21e2380 2226 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2227 new_entry->rb = ramblock;
2228 new_entry->offset = start;
2229 new_entry->len = len;
2230
2231 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2232 qemu_mutex_lock(&rs->src_page_req_mutex);
2233 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2234 migration_make_urgent_request();
ec481c6c 2235 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2236
2237 return 0;
6c595cde
DDAG
2238}
2239
d7400a34
XG
2240static bool save_page_use_compression(RAMState *rs)
2241{
2242 if (!migrate_use_compression()) {
2243 return false;
2244 }
2245
2246 /*
1a373522
DH
2247 * If xbzrle is enabled (e.g., after first round of migration), stop
2248 * using the data compression. In theory, xbzrle can do better than
2249 * compression.
d7400a34 2250 */
1a373522
DH
2251 if (rs->xbzrle_enabled) {
2252 return false;
d7400a34
XG
2253 }
2254
1a373522 2255 return true;
d7400a34
XG
2256}
2257
5e5fdcff
XG
2258/*
2259 * try to compress the page before posting it out, return true if the page
2260 * has been properly handled by compression, otherwise needs other
2261 * paths to handle it
2262 */
ec6f3ab9
PX
2263static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2264 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2265{
2266 if (!save_page_use_compression(rs)) {
2267 return false;
2268 }
2269
2270 /*
2271 * When starting the process of a new block, the first page of
2272 * the block should be sent out before other pages in the same
2273 * block, and all the pages in last block should have been sent
2274 * out, keeping this order is important, because the 'cont' flag
2275 * is used to avoid resending the block name.
2276 *
2277 * We post the fist page as normal page as compression will take
2278 * much CPU resource.
2279 */
ec6f3ab9 2280 if (block != pss->last_sent_block) {
5e5fdcff
XG
2281 flush_compressed_data(rs);
2282 return false;
2283 }
2284
eaa238ab 2285 if (compress_page_with_multi_thread(block, offset) > 0) {
5e5fdcff
XG
2286 return true;
2287 }
2288
76e03000 2289 compression_counters.busy++;
5e5fdcff
XG
2290 return false;
2291}
2292
a82d593b 2293/**
3d0684b2 2294 * ram_save_target_page: save one target page
a82d593b 2295 *
3d0684b2 2296 * Returns the number of pages written
a82d593b 2297 *
6f37bb8b 2298 * @rs: current RAM state
3d0684b2 2299 * @pss: data about the page we want to send
a82d593b 2300 */
05931ec5 2301static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2302{
a8ec91f9 2303 RAMBlock *block = pss->block;
8bba004c 2304 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2305 int res;
2306
61717ea9 2307 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2308 return res;
2309 }
2310
ec6f3ab9 2311 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2312 return 1;
d7400a34
XG
2313 }
2314
ec6f3ab9 2315 res = save_zero_page(pss, block, offset);
d7400a34
XG
2316 if (res > 0) {
2317 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2318 * page would be stale
2319 */
ef5c3d13 2320 if (rs->xbzrle_enabled) {
d7400a34
XG
2321 XBZRLE_cache_lock();
2322 xbzrle_cache_zero_page(rs, block->offset + offset);
2323 XBZRLE_cache_unlock();
2324 }
d7400a34
XG
2325 return res;
2326 }
2327
da3f56cb 2328 /*
6f39c90b
PX
2329 * Do not use multifd in postcopy as one whole host page should be
2330 * placed. Meanwhile postcopy requires atomic update of pages, so even
2331 * if host page size == guest page size the dest guest during run may
2332 * still see partially copied pages which is data corruption.
da3f56cb 2333 */
6f39c90b 2334 if (migrate_use_multifd() && !migration_in_postcopy()) {
61717ea9 2335 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2336 }
2337
05931ec5 2338 return ram_save_page(rs, pss);
a82d593b
DDAG
2339}
2340
d9e474ea
PX
2341/* Should be called before sending a host page */
2342static void pss_host_page_prepare(PageSearchStatus *pss)
2343{
2344 /* How many guest pages are there in one host page? */
2345 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2346
2347 pss->host_page_sending = true;
301d7ffe
PX
2348 if (guest_pfns <= 1) {
2349 /*
2350 * This covers both when guest psize == host psize, or when guest
2351 * has larger psize than the host (guest_pfns==0).
2352 *
2353 * For the latter, we always send one whole guest page per
2354 * iteration of the host page (example: an Alpha VM on x86 host
2355 * will have guest psize 8K while host psize 4K).
2356 */
2357 pss->host_page_start = pss->page;
2358 pss->host_page_end = pss->page + 1;
2359 } else {
2360 /*
2361 * The host page spans over multiple guest pages, we send them
2362 * within the same host page iteration.
2363 */
2364 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2365 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2366 }
d9e474ea
PX
2367}
2368
2369/*
2370 * Whether the page pointed by PSS is within the host page being sent.
2371 * Must be called after a previous pss_host_page_prepare().
2372 */
2373static bool pss_within_range(PageSearchStatus *pss)
2374{
2375 ram_addr_t ram_addr;
2376
2377 assert(pss->host_page_sending);
2378
2379 /* Over host-page boundary? */
2380 if (pss->page >= pss->host_page_end) {
2381 return false;
2382 }
2383
2384 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2385
2386 return offset_in_ramblock(pss->block, ram_addr);
2387}
2388
2389static void pss_host_page_finish(PageSearchStatus *pss)
2390{
2391 pss->host_page_sending = false;
2392 /* This is not needed, but just to reset it */
2393 pss->host_page_start = pss->host_page_end = 0;
2394}
2395
93589827
PX
2396/*
2397 * Send an urgent host page specified by `pss'. Need to be called with
2398 * bitmap_mutex held.
2399 *
2400 * Returns 0 if save host page succeeded, false otherwise.
2401 */
2402static int ram_save_host_page_urgent(PageSearchStatus *pss)
2403{
2404 bool page_dirty, sent = false;
2405 RAMState *rs = ram_state;
2406 int ret = 0;
2407
2408 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2409 pss_host_page_prepare(pss);
2410
2411 /*
2412 * If precopy is sending the same page, let it be done in precopy, or
2413 * we could send the same page in two channels and none of them will
2414 * receive the whole page.
2415 */
2416 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2417 trace_postcopy_preempt_hit(pss->block->idstr,
2418 pss->page << TARGET_PAGE_BITS);
2419 return 0;
2420 }
2421
2422 do {
2423 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2424
2425 if (page_dirty) {
2426 /* Be strict to return code; it must be 1, or what else? */
2427 if (ram_save_target_page(rs, pss) != 1) {
2428 error_report_once("%s: ram_save_target_page failed", __func__);
2429 ret = -1;
2430 goto out;
2431 }
2432 sent = true;
2433 }
2434 pss_find_next_dirty(pss);
2435 } while (pss_within_range(pss));
2436out:
2437 pss_host_page_finish(pss);
2438 /* For urgent requests, flush immediately if sent */
2439 if (sent) {
2440 qemu_fflush(pss->pss_channel);
2441 }
2442 return ret;
2443}
2444
a82d593b 2445/**
3d0684b2 2446 * ram_save_host_page: save a whole host page
a82d593b 2447 *
3d0684b2
JQ
2448 * Starting at *offset send pages up to the end of the current host
2449 * page. It's valid for the initial offset to point into the middle of
2450 * a host page in which case the remainder of the hostpage is sent.
2451 * Only dirty target pages are sent. Note that the host page size may
2452 * be a huge page for this block.
f3321554 2453 *
1eb3fc0a
DDAG
2454 * The saving stops at the boundary of the used_length of the block
2455 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2456 *
f3321554
PX
2457 * The caller must be with ram_state.bitmap_mutex held to call this
2458 * function. Note that this function can temporarily release the lock, but
2459 * when the function is returned it'll make sure the lock is still held.
2460 *
3d0684b2
JQ
2461 * Returns the number of pages written or negative on error
2462 *
6f37bb8b 2463 * @rs: current RAM state
3d0684b2 2464 * @pss: data about the page we want to send
a82d593b 2465 */
05931ec5 2466static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2467{
f3321554 2468 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2469 int tmppages, pages = 0;
a935e30f
JQ
2470 size_t pagesize_bits =
2471 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2472 unsigned long start_page = pss->page;
2473 int res;
4c011c37 2474
fbd162e6 2475 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2476 error_report("block %s should not be migrated !", pss->block->idstr);
2477 return 0;
2478 }
2479
d9e474ea
PX
2480 /* Update host page boundary information */
2481 pss_host_page_prepare(pss);
2482
a82d593b 2483 do {
f3321554 2484 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2485
f3321554
PX
2486 /* Check the pages is dirty and if it is send it */
2487 if (page_dirty) {
ba1b7c81 2488 /*
f3321554
PX
2489 * Properly yield the lock only in postcopy preempt mode
2490 * because both migration thread and rp-return thread can
2491 * operate on the bitmaps.
ba1b7c81 2492 */
f3321554
PX
2493 if (preempt_active) {
2494 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2495 }
f3321554
PX
2496 tmppages = ram_save_target_page(rs, pss);
2497 if (tmppages >= 0) {
2498 pages += tmppages;
2499 /*
2500 * Allow rate limiting to happen in the middle of huge pages if
2501 * something is sent in the current iteration.
2502 */
2503 if (pagesize_bits > 1 && tmppages > 0) {
2504 migration_rate_limit();
2505 }
2506 }
2507 if (preempt_active) {
2508 qemu_mutex_lock(&rs->bitmap_mutex);
2509 }
2510 } else {
2511 tmppages = 0;
23feba90 2512 }
f3321554
PX
2513
2514 if (tmppages < 0) {
d9e474ea 2515 pss_host_page_finish(pss);
f3321554
PX
2516 return tmppages;
2517 }
2518
d9e474ea
PX
2519 pss_find_next_dirty(pss);
2520 } while (pss_within_range(pss));
2521
2522 pss_host_page_finish(pss);
278e2f55
AG
2523
2524 res = ram_save_release_protection(rs, pss, start_page);
2525 return (res < 0 ? res : pages);
a82d593b 2526}
6c595cde 2527
56e93d26 2528/**
3d0684b2 2529 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2530 *
2531 * Called within an RCU critical section.
2532 *
e8f3735f
XG
2533 * Returns the number of pages written where zero means no dirty pages,
2534 * or negative on error
56e93d26 2535 *
6f37bb8b 2536 * @rs: current RAM state
a82d593b
DDAG
2537 *
2538 * On systems where host-page-size > target-page-size it will send all the
2539 * pages in a host page that are dirty.
56e93d26 2540 */
05931ec5 2541static int ram_find_and_save_block(RAMState *rs)
56e93d26 2542{
f1668764 2543 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2544 int pages = 0;
56e93d26 2545
0827b9e9
AA
2546 /* No dirty page as there is zero RAM */
2547 if (!ram_bytes_total()) {
2548 return pages;
2549 }
2550
4934a5dd
PX
2551 /*
2552 * Always keep last_seen_block/last_page valid during this procedure,
2553 * because find_dirty_block() relies on these values (e.g., we compare
2554 * last_seen_block with pss.block to see whether we searched all the
2555 * ramblocks) to detect the completion of migration. Having NULL value
2556 * of last_seen_block can conditionally cause below loop to run forever.
2557 */
2558 if (!rs->last_seen_block) {
2559 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2560 rs->last_page = 0;
2561 }
2562
f1668764 2563 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2564
b9e60928 2565 do {
51efd36f 2566 if (!get_queued_page(rs, pss)) {
b062106d 2567 /* priority queue empty, so just search for something dirty */
51efd36f
JQ
2568 bool again = true;
2569 if (!find_dirty_block(rs, pss, &again)) {
2570 if (!again) {
2571 break;
2572 }
2573 }
56e93d26 2574 }
51efd36f
JQ
2575 pages = ram_save_host_page(rs, pss);
2576 } while (!pages);
56e93d26 2577
f1668764
PX
2578 rs->last_seen_block = pss->block;
2579 rs->last_page = pss->page;
56e93d26
JQ
2580
2581 return pages;
2582}
2583
2584void acct_update_position(QEMUFile *f, size_t size, bool zero)
2585{
2586 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2587
56e93d26 2588 if (zero) {
23b7576d 2589 stat64_add(&ram_atomic_counters.duplicate, pages);
56e93d26 2590 } else {
23b7576d 2591 stat64_add(&ram_atomic_counters.normal, pages);
4c2d0f6d 2592 ram_transferred_add(size);
1a93bd2f 2593 qemu_file_credit_transfer(f, size);
56e93d26
JQ
2594 }
2595}
2596
fbd162e6 2597static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2598{
2599 RAMBlock *block;
2600 uint64_t total = 0;
2601
89ac5a1d
DDAG
2602 RCU_READ_LOCK_GUARD();
2603
fbd162e6
YK
2604 if (count_ignored) {
2605 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2606 total += block->used_length;
2607 }
2608 } else {
2609 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2610 total += block->used_length;
2611 }
99e15582 2612 }
56e93d26
JQ
2613 return total;
2614}
2615
fbd162e6
YK
2616uint64_t ram_bytes_total(void)
2617{
2618 return ram_bytes_total_common(false);
2619}
2620
f265e0e4 2621static void xbzrle_load_setup(void)
56e93d26 2622{
f265e0e4 2623 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2624}
2625
f265e0e4
JQ
2626static void xbzrle_load_cleanup(void)
2627{
2628 g_free(XBZRLE.decoded_buf);
2629 XBZRLE.decoded_buf = NULL;
2630}
2631
7d7c96be
PX
2632static void ram_state_cleanup(RAMState **rsp)
2633{
b9ccaf6d
DDAG
2634 if (*rsp) {
2635 migration_page_queue_free(*rsp);
2636 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2637 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2638 g_free(*rsp);
2639 *rsp = NULL;
2640 }
7d7c96be
PX
2641}
2642
84593a08
PX
2643static void xbzrle_cleanup(void)
2644{
2645 XBZRLE_cache_lock();
2646 if (XBZRLE.cache) {
2647 cache_fini(XBZRLE.cache);
2648 g_free(XBZRLE.encoded_buf);
2649 g_free(XBZRLE.current_buf);
2650 g_free(XBZRLE.zero_target_page);
2651 XBZRLE.cache = NULL;
2652 XBZRLE.encoded_buf = NULL;
2653 XBZRLE.current_buf = NULL;
2654 XBZRLE.zero_target_page = NULL;
2655 }
2656 XBZRLE_cache_unlock();
2657}
2658
f265e0e4 2659static void ram_save_cleanup(void *opaque)
56e93d26 2660{
53518d94 2661 RAMState **rsp = opaque;
6b6712ef 2662 RAMBlock *block;
eb859c53 2663
278e2f55
AG
2664 /* We don't use dirty log with background snapshots */
2665 if (!migrate_background_snapshot()) {
2666 /* caller have hold iothread lock or is in a bh, so there is
2667 * no writing race against the migration bitmap
2668 */
63b41db4
HH
2669 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2670 /*
2671 * do not stop dirty log without starting it, since
2672 * memory_global_dirty_log_stop will assert that
2673 * memory_global_dirty_log_start/stop used in pairs
2674 */
2675 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2676 }
278e2f55 2677 }
6b6712ef 2678
fbd162e6 2679 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2680 g_free(block->clear_bmap);
2681 block->clear_bmap = NULL;
6b6712ef
JQ
2682 g_free(block->bmap);
2683 block->bmap = NULL;
56e93d26
JQ
2684 }
2685
84593a08 2686 xbzrle_cleanup();
f0afa331 2687 compress_threads_save_cleanup();
7d7c96be 2688 ram_state_cleanup(rsp);
56e93d26
JQ
2689}
2690
6f37bb8b 2691static void ram_state_reset(RAMState *rs)
56e93d26 2692{
ec6f3ab9
PX
2693 int i;
2694
2695 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2696 rs->pss[i].last_sent_block = NULL;
2697 }
2698
6f37bb8b 2699 rs->last_seen_block = NULL;
269ace29 2700 rs->last_page = 0;
6f37bb8b 2701 rs->last_version = ram_list.version;
1a373522 2702 rs->xbzrle_enabled = false;
56e93d26
JQ
2703}
2704
2705#define MAX_WAIT 50 /* ms, half buffered_file limit */
2706
e0b266f0
DDAG
2707/* **** functions for postcopy ***** */
2708
ced1c616
PB
2709void ram_postcopy_migrated_memory_release(MigrationState *ms)
2710{
2711 struct RAMBlock *block;
ced1c616 2712
fbd162e6 2713 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2714 unsigned long *bitmap = block->bmap;
2715 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2716 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2717
2718 while (run_start < range) {
2719 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2720 ram_discard_range(block->idstr,
2721 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2722 ((ram_addr_t)(run_end - run_start))
2723 << TARGET_PAGE_BITS);
ced1c616
PB
2724 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2725 }
2726 }
2727}
2728
3d0684b2
JQ
2729/**
2730 * postcopy_send_discard_bm_ram: discard a RAMBlock
2731 *
e0b266f0 2732 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2733 *
2734 * @ms: current migration state
89dab31b 2735 * @block: RAMBlock to discard
e0b266f0 2736 */
9e7d1223 2737static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2738{
6b6712ef 2739 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2740 unsigned long current;
1e7cf8c3 2741 unsigned long *bitmap = block->bmap;
e0b266f0 2742
6b6712ef 2743 for (current = 0; current < end; ) {
1e7cf8c3 2744 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2745 unsigned long zero, discard_length;
e0b266f0 2746
33a5cb62
WY
2747 if (one >= end) {
2748 break;
2749 }
e0b266f0 2750
1e7cf8c3 2751 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2752
2753 if (zero >= end) {
2754 discard_length = end - one;
e0b266f0 2755 } else {
33a5cb62
WY
2756 discard_length = zero - one;
2757 }
810cf2bb 2758 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2759 current = one + discard_length;
e0b266f0 2760 }
e0b266f0
DDAG
2761}
2762
f30c2e5b
PX
2763static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2764
3d0684b2
JQ
2765/**
2766 * postcopy_each_ram_send_discard: discard all RAMBlocks
2767 *
e0b266f0
DDAG
2768 * Utility for the outgoing postcopy code.
2769 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2770 * passing it bitmap indexes and name.
e0b266f0
DDAG
2771 * (qemu_ram_foreach_block ends up passing unscaled lengths
2772 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2773 *
2774 * @ms: current migration state
e0b266f0 2775 */
739fcc1b 2776static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2777{
2778 struct RAMBlock *block;
e0b266f0 2779
fbd162e6 2780 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2781 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2782
f30c2e5b
PX
2783 /*
2784 * Deal with TPS != HPS and huge pages. It discard any partially sent
2785 * host-page size chunks, mark any partially dirty host-page size
2786 * chunks as all dirty. In this case the host-page is the host-page
2787 * for the particular RAMBlock, i.e. it might be a huge page.
2788 */
2789 postcopy_chunk_hostpages_pass(ms, block);
2790
e0b266f0
DDAG
2791 /*
2792 * Postcopy sends chunks of bitmap over the wire, but it
2793 * just needs indexes at this point, avoids it having
2794 * target page specific code.
2795 */
739fcc1b 2796 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2797 postcopy_discard_send_finish(ms);
e0b266f0 2798 }
e0b266f0
DDAG
2799}
2800
3d0684b2 2801/**
8324ef86 2802 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2803 *
2804 * Helper for postcopy_chunk_hostpages; it's called twice to
2805 * canonicalize the two bitmaps, that are similar, but one is
2806 * inverted.
99e314eb 2807 *
3d0684b2
JQ
2808 * Postcopy requires that all target pages in a hostpage are dirty or
2809 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2810 *
3d0684b2 2811 * @ms: current migration state
3d0684b2 2812 * @block: block that contains the page we want to canonicalize
99e314eb 2813 */
1e7cf8c3 2814static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2815{
53518d94 2816 RAMState *rs = ram_state;
6b6712ef 2817 unsigned long *bitmap = block->bmap;
29c59172 2818 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2819 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2820 unsigned long run_start;
2821
29c59172
DDAG
2822 if (block->page_size == TARGET_PAGE_SIZE) {
2823 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2824 return;
2825 }
2826
1e7cf8c3
WY
2827 /* Find a dirty page */
2828 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2829
6b6712ef 2830 while (run_start < pages) {
99e314eb
DDAG
2831
2832 /*
2833 * If the start of this run of pages is in the middle of a host
2834 * page, then we need to fixup this host page.
2835 */
9dec3cc3 2836 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2837 /* Find the end of this run */
1e7cf8c3 2838 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2839 /*
2840 * If the end isn't at the start of a host page, then the
2841 * run doesn't finish at the end of a host page
2842 * and we need to discard.
2843 */
99e314eb
DDAG
2844 }
2845
9dec3cc3 2846 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2847 unsigned long page;
dad45ab2
WY
2848 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2849 host_ratio);
2850 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2851
99e314eb
DDAG
2852 /* Clean up the bitmap */
2853 for (page = fixup_start_addr;
2854 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2855 /*
2856 * Remark them as dirty, updating the count for any pages
2857 * that weren't previously dirty.
2858 */
0d8ec885 2859 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2860 }
2861 }
2862
1e7cf8c3
WY
2863 /* Find the next dirty page for the next iteration */
2864 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2865 }
2866}
2867
3d0684b2
JQ
2868/**
2869 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2870 *
e0b266f0
DDAG
2871 * Transmit the set of pages to be discarded after precopy to the target
2872 * these are pages that:
2873 * a) Have been previously transmitted but are now dirty again
2874 * b) Pages that have never been transmitted, this ensures that
2875 * any pages on the destination that have been mapped by background
2876 * tasks get discarded (transparent huge pages is the specific concern)
2877 * Hopefully this is pretty sparse
3d0684b2
JQ
2878 *
2879 * @ms: current migration state
e0b266f0 2880 */
739fcc1b 2881void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2882{
53518d94 2883 RAMState *rs = ram_state;
e0b266f0 2884
89ac5a1d 2885 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2886
2887 /* This should be our last sync, the src is now paused */
eb859c53 2888 migration_bitmap_sync(rs);
e0b266f0 2889
6b6712ef 2890 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2891 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2892 rs->last_seen_block = NULL;
6b6712ef 2893 rs->last_page = 0;
e0b266f0 2894
739fcc1b 2895 postcopy_each_ram_send_discard(ms);
e0b266f0 2896
739fcc1b 2897 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2898}
2899
3d0684b2
JQ
2900/**
2901 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2902 *
3d0684b2 2903 * Returns zero on success
e0b266f0 2904 *
36449157
JQ
2905 * @rbname: name of the RAMBlock of the request. NULL means the
2906 * same that last one.
3d0684b2
JQ
2907 * @start: RAMBlock starting page
2908 * @length: RAMBlock size
e0b266f0 2909 */
aaa2064c 2910int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2911{
36449157 2912 trace_ram_discard_range(rbname, start, length);
d3a5038c 2913
89ac5a1d 2914 RCU_READ_LOCK_GUARD();
36449157 2915 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2916
2917 if (!rb) {
36449157 2918 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2919 return -1;
e0b266f0
DDAG
2920 }
2921
814bb08f
PX
2922 /*
2923 * On source VM, we don't need to update the received bitmap since
2924 * we don't even have one.
2925 */
2926 if (rb->receivedmap) {
2927 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2928 length >> qemu_target_page_bits());
2929 }
2930
03acb4e9 2931 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2932}
2933
84593a08
PX
2934/*
2935 * For every allocation, we will try not to crash the VM if the
2936 * allocation failed.
2937 */
2938static int xbzrle_init(void)
2939{
2940 Error *local_err = NULL;
2941
2942 if (!migrate_use_xbzrle()) {
2943 return 0;
2944 }
2945
2946 XBZRLE_cache_lock();
2947
2948 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2949 if (!XBZRLE.zero_target_page) {
2950 error_report("%s: Error allocating zero page", __func__);
2951 goto err_out;
2952 }
2953
2954 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2955 TARGET_PAGE_SIZE, &local_err);
2956 if (!XBZRLE.cache) {
2957 error_report_err(local_err);
2958 goto free_zero_page;
2959 }
2960
2961 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2962 if (!XBZRLE.encoded_buf) {
2963 error_report("%s: Error allocating encoded_buf", __func__);
2964 goto free_cache;
2965 }
2966
2967 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2968 if (!XBZRLE.current_buf) {
2969 error_report("%s: Error allocating current_buf", __func__);
2970 goto free_encoded_buf;
2971 }
2972
2973 /* We are all good */
2974 XBZRLE_cache_unlock();
2975 return 0;
2976
2977free_encoded_buf:
2978 g_free(XBZRLE.encoded_buf);
2979 XBZRLE.encoded_buf = NULL;
2980free_cache:
2981 cache_fini(XBZRLE.cache);
2982 XBZRLE.cache = NULL;
2983free_zero_page:
2984 g_free(XBZRLE.zero_target_page);
2985 XBZRLE.zero_target_page = NULL;
2986err_out:
2987 XBZRLE_cache_unlock();
2988 return -ENOMEM;
2989}
2990
53518d94 2991static int ram_state_init(RAMState **rsp)
56e93d26 2992{
7d00ee6a
PX
2993 *rsp = g_try_new0(RAMState, 1);
2994
2995 if (!*rsp) {
2996 error_report("%s: Init ramstate fail", __func__);
2997 return -1;
2998 }
53518d94
JQ
2999
3000 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3001 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3002 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3003
7d00ee6a 3004 /*
40c4d4a8
IR
3005 * Count the total number of pages used by ram blocks not including any
3006 * gaps due to alignment or unplugs.
03158519 3007 * This must match with the initial values of dirty bitmap.
7d00ee6a 3008 */
40c4d4a8 3009 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3010 ram_state_reset(*rsp);
3011
3012 return 0;
3013}
3014
d6eff5d7 3015static void ram_list_init_bitmaps(void)
7d00ee6a 3016{
002cad6b 3017 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3018 RAMBlock *block;
3019 unsigned long pages;
002cad6b 3020 uint8_t shift;
56e93d26 3021
0827b9e9
AA
3022 /* Skip setting bitmap if there is no RAM */
3023 if (ram_bytes_total()) {
002cad6b
PX
3024 shift = ms->clear_bitmap_shift;
3025 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3026 error_report("clear_bitmap_shift (%u) too big, using "
3027 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3028 shift = CLEAR_BITMAP_SHIFT_MAX;
3029 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3030 error_report("clear_bitmap_shift (%u) too small, using "
3031 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3032 shift = CLEAR_BITMAP_SHIFT_MIN;
3033 }
3034
fbd162e6 3035 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3036 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3037 /*
3038 * The initial dirty bitmap for migration must be set with all
3039 * ones to make sure we'll migrate every guest RAM page to
3040 * destination.
40c4d4a8
IR
3041 * Here we set RAMBlock.bmap all to 1 because when rebegin a
3042 * new migration after a failed migration, ram_list.
3043 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3044 * guest memory.
03158519 3045 */
6b6712ef 3046 block->bmap = bitmap_new(pages);
40c4d4a8 3047 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3048 block->clear_bmap_shift = shift;
3049 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3050 }
f3f491fc 3051 }
d6eff5d7
PX
3052}
3053
be39b4cd
DH
3054static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3055{
3056 unsigned long pages;
3057 RAMBlock *rb;
3058
3059 RCU_READ_LOCK_GUARD();
3060
3061 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3062 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3063 rs->migration_dirty_pages -= pages;
3064 }
3065}
3066
d6eff5d7
PX
3067static void ram_init_bitmaps(RAMState *rs)
3068{
3069 /* For memory_global_dirty_log_start below. */
3070 qemu_mutex_lock_iothread();
3071 qemu_mutex_lock_ramlist();
f3f491fc 3072
89ac5a1d
DDAG
3073 WITH_RCU_READ_LOCK_GUARD() {
3074 ram_list_init_bitmaps();
278e2f55
AG
3075 /* We don't use dirty log with background snapshots */
3076 if (!migrate_background_snapshot()) {
63b41db4 3077 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3078 migration_bitmap_sync_precopy(rs);
3079 }
89ac5a1d 3080 }
56e93d26 3081 qemu_mutex_unlock_ramlist();
49877834 3082 qemu_mutex_unlock_iothread();
be39b4cd
DH
3083
3084 /*
3085 * After an eventual first bitmap sync, fixup the initial bitmap
3086 * containing all 1s to exclude any discarded pages from migration.
3087 */
3088 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3089}
3090
3091static int ram_init_all(RAMState **rsp)
3092{
3093 if (ram_state_init(rsp)) {
3094 return -1;
3095 }
3096
3097 if (xbzrle_init()) {
3098 ram_state_cleanup(rsp);
3099 return -1;
3100 }
3101
3102 ram_init_bitmaps(*rsp);
a91246c9
HZ
3103
3104 return 0;
3105}
3106
08614f34
PX
3107static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3108{
3109 RAMBlock *block;
3110 uint64_t pages = 0;
3111
3112 /*
3113 * Postcopy is not using xbzrle/compression, so no need for that.
3114 * Also, since source are already halted, we don't need to care
3115 * about dirty page logging as well.
3116 */
3117
fbd162e6 3118 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3119 pages += bitmap_count_one(block->bmap,
3120 block->used_length >> TARGET_PAGE_BITS);
3121 }
3122
3123 /* This may not be aligned with current bitmaps. Recalculate. */
3124 rs->migration_dirty_pages = pages;
3125
1a373522 3126 ram_state_reset(rs);
08614f34
PX
3127
3128 /* Update RAMState cache of output QEMUFile */
7f401b80 3129 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3130
3131 trace_ram_state_resume_prepare(pages);
3132}
3133
6bcb05fc
WW
3134/*
3135 * This function clears bits of the free pages reported by the caller from the
3136 * migration dirty bitmap. @addr is the host address corresponding to the
3137 * start of the continuous guest free pages, and @len is the total bytes of
3138 * those pages.
3139 */
3140void qemu_guest_free_page_hint(void *addr, size_t len)
3141{
3142 RAMBlock *block;
3143 ram_addr_t offset;
3144 size_t used_len, start, npages;
3145 MigrationState *s = migrate_get_current();
3146
3147 /* This function is currently expected to be used during live migration */
3148 if (!migration_is_setup_or_active(s->state)) {
3149 return;
3150 }
3151
3152 for (; len > 0; len -= used_len, addr += used_len) {
3153 block = qemu_ram_block_from_host(addr, false, &offset);
3154 if (unlikely(!block || offset >= block->used_length)) {
3155 /*
3156 * The implementation might not support RAMBlock resize during
3157 * live migration, but it could happen in theory with future
3158 * updates. So we add a check here to capture that case.
3159 */
3160 error_report_once("%s unexpected error", __func__);
3161 return;
3162 }
3163
3164 if (len <= block->used_length - offset) {
3165 used_len = len;
3166 } else {
3167 used_len = block->used_length - offset;
3168 }
3169
3170 start = offset >> TARGET_PAGE_BITS;
3171 npages = used_len >> TARGET_PAGE_BITS;
3172
3173 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3174 /*
3175 * The skipped free pages are equavalent to be sent from clear_bmap's
3176 * perspective, so clear the bits from the memory region bitmap which
3177 * are initially set. Otherwise those skipped pages will be sent in
3178 * the next round after syncing from the memory region bitmap.
3179 */
1230a25f 3180 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3181 ram_state->migration_dirty_pages -=
3182 bitmap_count_one_with_offset(block->bmap, start, npages);
3183 bitmap_clear(block->bmap, start, npages);
3184 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3185 }
3186}
3187
3d0684b2
JQ
3188/*
3189 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3190 * long-running RCU critical section. When rcu-reclaims in the code
3191 * start to become numerous it will be necessary to reduce the
3192 * granularity of these critical sections.
3193 */
3194
3d0684b2
JQ
3195/**
3196 * ram_save_setup: Setup RAM for migration
3197 *
3198 * Returns zero to indicate success and negative for error
3199 *
3200 * @f: QEMUFile where to send the data
3201 * @opaque: RAMState pointer
3202 */
a91246c9
HZ
3203static int ram_save_setup(QEMUFile *f, void *opaque)
3204{
53518d94 3205 RAMState **rsp = opaque;
a91246c9 3206 RAMBlock *block;
33d70973 3207 int ret;
a91246c9 3208
dcaf446e
XG
3209 if (compress_threads_save_setup()) {
3210 return -1;
3211 }
3212
a91246c9
HZ
3213 /* migration has already setup the bitmap, reuse it. */
3214 if (!migration_in_colo_state()) {
7d00ee6a 3215 if (ram_init_all(rsp) != 0) {
dcaf446e 3216 compress_threads_save_cleanup();
a91246c9 3217 return -1;
53518d94 3218 }
a91246c9 3219 }
7f401b80 3220 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3221
0e6ebd48
DDAG
3222 WITH_RCU_READ_LOCK_GUARD() {
3223 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3224
0e6ebd48
DDAG
3225 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3226 qemu_put_byte(f, strlen(block->idstr));
3227 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3228 qemu_put_be64(f, block->used_length);
3229 if (migrate_postcopy_ram() && block->page_size !=
3230 qemu_host_page_size) {
3231 qemu_put_be64(f, block->page_size);
3232 }
3233 if (migrate_ignore_shared()) {
3234 qemu_put_be64(f, block->mr->addr);
3235 }
fbd162e6 3236 }
56e93d26
JQ
3237 }
3238
56e93d26
JQ
3239 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3240 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3241
33d70973
LB
3242 ret = multifd_send_sync_main(f);
3243 if (ret < 0) {
3244 return ret;
3245 }
3246
56e93d26 3247 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3248 qemu_fflush(f);
56e93d26
JQ
3249
3250 return 0;
3251}
3252
3d0684b2
JQ
3253/**
3254 * ram_save_iterate: iterative stage for migration
3255 *
3256 * Returns zero to indicate success and negative for error
3257 *
3258 * @f: QEMUFile where to send the data
3259 * @opaque: RAMState pointer
3260 */
56e93d26
JQ
3261static int ram_save_iterate(QEMUFile *f, void *opaque)
3262{
53518d94
JQ
3263 RAMState **temp = opaque;
3264 RAMState *rs = *temp;
3d4095b2 3265 int ret = 0;
56e93d26
JQ
3266 int i;
3267 int64_t t0;
5c90308f 3268 int done = 0;
56e93d26 3269
b2557345
PL
3270 if (blk_mig_bulk_active()) {
3271 /* Avoid transferring ram during bulk phase of block migration as
3272 * the bulk phase will usually take a long time and transferring
3273 * ram updates during that time is pointless. */
3274 goto out;
3275 }
3276
63268c49
PX
3277 /*
3278 * We'll take this lock a little bit long, but it's okay for two reasons.
3279 * Firstly, the only possible other thread to take it is who calls
3280 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3281 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3282 * guarantees that we'll at least released it in a regular basis.
3283 */
3284 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3285 WITH_RCU_READ_LOCK_GUARD() {
3286 if (ram_list.version != rs->last_version) {
3287 ram_state_reset(rs);
3288 }
56e93d26 3289
89ac5a1d
DDAG
3290 /* Read version before ram_list.blocks */
3291 smp_rmb();
56e93d26 3292
89ac5a1d 3293 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3294
89ac5a1d
DDAG
3295 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3296 i = 0;
3297 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3298 postcopy_has_request(rs)) {
89ac5a1d 3299 int pages;
e03a34f8 3300
89ac5a1d
DDAG
3301 if (qemu_file_get_error(f)) {
3302 break;
3303 }
e8f3735f 3304
05931ec5 3305 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3306 /* no more pages to sent */
3307 if (pages == 0) {
3308 done = 1;
3309 break;
3310 }
e8f3735f 3311
89ac5a1d
DDAG
3312 if (pages < 0) {
3313 qemu_file_set_error(f, pages);
56e93d26
JQ
3314 break;
3315 }
89ac5a1d
DDAG
3316
3317 rs->target_page_count += pages;
3318
644acf99
WY
3319 /*
3320 * During postcopy, it is necessary to make sure one whole host
3321 * page is sent in one chunk.
3322 */
3323 if (migrate_postcopy_ram()) {
3324 flush_compressed_data(rs);
3325 }
3326
89ac5a1d
DDAG
3327 /*
3328 * we want to check in the 1st loop, just in case it was the 1st
3329 * time and we had to sync the dirty bitmap.
3330 * qemu_clock_get_ns() is a bit expensive, so we only check each
3331 * some iterations
3332 */
3333 if ((i & 63) == 0) {
3334 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3335 1000000;
3336 if (t1 > MAX_WAIT) {
3337 trace_ram_save_iterate_big_wait(t1, i);
3338 break;
3339 }
3340 }
3341 i++;
56e93d26 3342 }
56e93d26 3343 }
63268c49 3344 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3345
3346 /*
3347 * Must occur before EOS (or any QEMUFile operation)
3348 * because of RDMA protocol.
3349 */
3350 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3351
b2557345 3352out:
b69a0227
JQ
3353 if (ret >= 0
3354 && migration_is_setup_or_active(migrate_get_current()->state)) {
7f401b80 3355 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3356 if (ret < 0) {
3357 return ret;
3358 }
3359
3d4095b2
JQ
3360 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3361 qemu_fflush(f);
4c2d0f6d 3362 ram_transferred_add(8);
56e93d26 3363
3d4095b2
JQ
3364 ret = qemu_file_get_error(f);
3365 }
56e93d26
JQ
3366 if (ret < 0) {
3367 return ret;
3368 }
3369
5c90308f 3370 return done;
56e93d26
JQ
3371}
3372
3d0684b2
JQ
3373/**
3374 * ram_save_complete: function called to send the remaining amount of ram
3375 *
e8f3735f 3376 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3377 *
3378 * Called with iothread lock
3379 *
3380 * @f: QEMUFile where to send the data
3381 * @opaque: RAMState pointer
3382 */
56e93d26
JQ
3383static int ram_save_complete(QEMUFile *f, void *opaque)
3384{
53518d94
JQ
3385 RAMState **temp = opaque;
3386 RAMState *rs = *temp;
e8f3735f 3387 int ret = 0;
6f37bb8b 3388
05931ec5
JQ
3389 rs->last_stage = !migration_in_colo_state();
3390
89ac5a1d
DDAG
3391 WITH_RCU_READ_LOCK_GUARD() {
3392 if (!migration_in_postcopy()) {
3393 migration_bitmap_sync_precopy(rs);
3394 }
56e93d26 3395
89ac5a1d 3396 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3397
89ac5a1d 3398 /* try transferring iterative blocks of memory */
56e93d26 3399
89ac5a1d 3400 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3401 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3402 while (true) {
3403 int pages;
56e93d26 3404
05931ec5 3405 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3406 /* no more blocks to sent */
3407 if (pages == 0) {
3408 break;
3409 }
3410 if (pages < 0) {
3411 ret = pages;
3412 break;
3413 }
e8f3735f 3414 }
c13221b5 3415 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3416
89ac5a1d
DDAG
3417 flush_compressed_data(rs);
3418 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3419 }
d09a6fde 3420
33d70973
LB
3421 if (ret < 0) {
3422 return ret;
3d4095b2 3423 }
56e93d26 3424
7f401b80 3425 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3426 if (ret < 0) {
3427 return ret;
3428 }
3429
3430 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3431 qemu_fflush(f);
3432
3433 return 0;
56e93d26
JQ
3434}
3435
fd70385d 3436static void ram_state_pending_estimate(void *opaque,
c8df4a7a
JQ
3437 uint64_t *res_precopy_only,
3438 uint64_t *res_compatible,
3439 uint64_t *res_postcopy_only)
56e93d26 3440{
53518d94
JQ
3441 RAMState **temp = opaque;
3442 RAMState *rs = *temp;
56e93d26 3443
c8df4a7a 3444 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3445
c8df4a7a
JQ
3446 if (migrate_postcopy_ram()) {
3447 /* We can do postcopy, and all the data is postcopiable */
3448 *res_postcopy_only += remaining_size;
3449 } else {
3450 *res_precopy_only += remaining_size;
3451 }
3452}
3453
fd70385d 3454static void ram_state_pending_exact(void *opaque,
c8df4a7a
JQ
3455 uint64_t *res_precopy_only,
3456 uint64_t *res_compatible,
3457 uint64_t *res_postcopy_only)
3458{
3459 RAMState **temp = opaque;
3460 RAMState *rs = *temp;
3461
3462 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3463
3464 if (!migration_in_postcopy()) {
56e93d26 3465 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3466 WITH_RCU_READ_LOCK_GUARD() {
3467 migration_bitmap_sync_precopy(rs);
3468 }
56e93d26 3469 qemu_mutex_unlock_iothread();
9edabd4d 3470 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3471 }
c31b098f 3472
86e1167e
VSO
3473 if (migrate_postcopy_ram()) {
3474 /* We can do postcopy, and all the data is postcopiable */
47995026 3475 *res_compatible += remaining_size;
86e1167e 3476 } else {
47995026 3477 *res_precopy_only += remaining_size;
86e1167e 3478 }
56e93d26
JQ
3479}
3480
3481static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3482{
3483 unsigned int xh_len;
3484 int xh_flags;
063e760a 3485 uint8_t *loaded_data;
56e93d26 3486
56e93d26
JQ
3487 /* extract RLE header */
3488 xh_flags = qemu_get_byte(f);
3489 xh_len = qemu_get_be16(f);
3490
3491 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3492 error_report("Failed to load XBZRLE page - wrong compression!");
3493 return -1;
3494 }
3495
3496 if (xh_len > TARGET_PAGE_SIZE) {
3497 error_report("Failed to load XBZRLE page - len overflow!");
3498 return -1;
3499 }
f265e0e4 3500 loaded_data = XBZRLE.decoded_buf;
56e93d26 3501 /* load data and decode */
f265e0e4 3502 /* it can change loaded_data to point to an internal buffer */
063e760a 3503 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3504
3505 /* decode RLE */
063e760a 3506 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3507 TARGET_PAGE_SIZE) == -1) {
3508 error_report("Failed to load XBZRLE page - decode error!");
3509 return -1;
3510 }
3511
3512 return 0;
3513}
3514
3d0684b2
JQ
3515/**
3516 * ram_block_from_stream: read a RAMBlock id from the migration stream
3517 *
3518 * Must be called from within a rcu critical section.
3519 *
56e93d26 3520 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3521 *
755e8d7c 3522 * @mis: the migration incoming state pointer
3d0684b2
JQ
3523 * @f: QEMUFile where to read the data from
3524 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3525 * @channel: the channel we're using
a7180877 3526 */
755e8d7c 3527static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3528 QEMUFile *f, int flags,
3529 int channel)
56e93d26 3530{
c01b16ed 3531 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3532 char id[256];
3533 uint8_t len;
3534
3535 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3536 if (!block) {
56e93d26
JQ
3537 error_report("Ack, bad migration stream!");
3538 return NULL;
3539 }
4c4bad48 3540 return block;
56e93d26
JQ
3541 }
3542
3543 len = qemu_get_byte(f);
3544 qemu_get_buffer(f, (uint8_t *)id, len);
3545 id[len] = 0;
3546
e3dd7493 3547 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3548 if (!block) {
3549 error_report("Can't find block %s", id);
3550 return NULL;
56e93d26
JQ
3551 }
3552
fbd162e6 3553 if (ramblock_is_ignored(block)) {
b895de50
CLG
3554 error_report("block %s should not be migrated !", id);
3555 return NULL;
3556 }
3557
c01b16ed 3558 mis->last_recv_block[channel] = block;
755e8d7c 3559
4c4bad48
HZ
3560 return block;
3561}
3562
3563static inline void *host_from_ram_block_offset(RAMBlock *block,
3564 ram_addr_t offset)
3565{
3566 if (!offset_in_ramblock(block, offset)) {
3567 return NULL;
3568 }
3569
3570 return block->host + offset;
56e93d26
JQ
3571}
3572
6a23f639
DH
3573static void *host_page_from_ram_block_offset(RAMBlock *block,
3574 ram_addr_t offset)
3575{
3576 /* Note: Explicitly no check against offset_in_ramblock(). */
3577 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3578 block->page_size);
3579}
3580
3581static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3582 ram_addr_t offset)
3583{
3584 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3585}
3586
13af18f2 3587static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3588 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3589{
3590 if (!offset_in_ramblock(block, offset)) {
3591 return NULL;
3592 }
3593 if (!block->colo_cache) {
3594 error_report("%s: colo_cache is NULL in block :%s",
3595 __func__, block->idstr);
3596 return NULL;
3597 }
7d9acafa
ZC
3598
3599 /*
3600 * During colo checkpoint, we need bitmap of these migrated pages.
3601 * It help us to decide which pages in ram cache should be flushed
3602 * into VM's RAM later.
3603 */
8af66371
HZ
3604 if (record_bitmap &&
3605 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3606 ram_state->migration_dirty_pages++;
3607 }
13af18f2
ZC
3608 return block->colo_cache + offset;
3609}
3610
3d0684b2
JQ
3611/**
3612 * ram_handle_compressed: handle the zero page case
3613 *
56e93d26
JQ
3614 * If a page (or a whole RDMA chunk) has been
3615 * determined to be zero, then zap it.
3d0684b2
JQ
3616 *
3617 * @host: host address for the zero page
3618 * @ch: what the page is filled from. We only support zero
3619 * @size: size of the zero page
56e93d26
JQ
3620 */
3621void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3622{
bad452a7 3623 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3624 memset(host, ch, size);
3625 }
3626}
3627
797ca154
XG
3628/* return the size after decompression, or negative value on error */
3629static int
3630qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3631 const uint8_t *source, size_t source_len)
3632{
3633 int err;
3634
3635 err = inflateReset(stream);
3636 if (err != Z_OK) {
3637 return -1;
3638 }
3639
3640 stream->avail_in = source_len;
3641 stream->next_in = (uint8_t *)source;
3642 stream->avail_out = dest_len;
3643 stream->next_out = dest;
3644
3645 err = inflate(stream, Z_NO_FLUSH);
3646 if (err != Z_STREAM_END) {
3647 return -1;
3648 }
3649
3650 return stream->total_out;
3651}
3652
56e93d26
JQ
3653static void *do_data_decompress(void *opaque)
3654{
3655 DecompressParam *param = opaque;
3656 unsigned long pagesize;
33d151f4 3657 uint8_t *des;
34ab9e97 3658 int len, ret;
56e93d26 3659
33d151f4 3660 qemu_mutex_lock(&param->mutex);
90e56fb4 3661 while (!param->quit) {
33d151f4
LL
3662 if (param->des) {
3663 des = param->des;
3664 len = param->len;
3665 param->des = 0;
3666 qemu_mutex_unlock(&param->mutex);
3667
56e93d26 3668 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3669
3670 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3671 param->compbuf, len);
f548222c 3672 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3673 error_report("decompress data failed");
3674 qemu_file_set_error(decomp_file, ret);
3675 }
73a8912b 3676
33d151f4
LL
3677 qemu_mutex_lock(&decomp_done_lock);
3678 param->done = true;
3679 qemu_cond_signal(&decomp_done_cond);
3680 qemu_mutex_unlock(&decomp_done_lock);
3681
3682 qemu_mutex_lock(&param->mutex);
3683 } else {
3684 qemu_cond_wait(&param->cond, &param->mutex);
3685 }
56e93d26 3686 }
33d151f4 3687 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3688
3689 return NULL;
3690}
3691
34ab9e97 3692static int wait_for_decompress_done(void)
5533b2e9
LL
3693{
3694 int idx, thread_count;
3695
3696 if (!migrate_use_compression()) {
34ab9e97 3697 return 0;
5533b2e9
LL
3698 }
3699
3700 thread_count = migrate_decompress_threads();
3701 qemu_mutex_lock(&decomp_done_lock);
3702 for (idx = 0; idx < thread_count; idx++) {
3703 while (!decomp_param[idx].done) {
3704 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3705 }
3706 }
3707 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3708 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3709}
3710
f0afa331 3711static void compress_threads_load_cleanup(void)
56e93d26
JQ
3712{
3713 int i, thread_count;
3714
3416ab5b
JQ
3715 if (!migrate_use_compression()) {
3716 return;
3717 }
56e93d26
JQ
3718 thread_count = migrate_decompress_threads();
3719 for (i = 0; i < thread_count; i++) {
797ca154
XG
3720 /*
3721 * we use it as a indicator which shows if the thread is
3722 * properly init'd or not
3723 */
3724 if (!decomp_param[i].compbuf) {
3725 break;
3726 }
3727
56e93d26 3728 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3729 decomp_param[i].quit = true;
56e93d26
JQ
3730 qemu_cond_signal(&decomp_param[i].cond);
3731 qemu_mutex_unlock(&decomp_param[i].mutex);
3732 }
3733 for (i = 0; i < thread_count; i++) {
797ca154
XG
3734 if (!decomp_param[i].compbuf) {
3735 break;
3736 }
3737
56e93d26
JQ
3738 qemu_thread_join(decompress_threads + i);
3739 qemu_mutex_destroy(&decomp_param[i].mutex);
3740 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3741 inflateEnd(&decomp_param[i].stream);
56e93d26 3742 g_free(decomp_param[i].compbuf);
797ca154 3743 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3744 }
3745 g_free(decompress_threads);
3746 g_free(decomp_param);
56e93d26
JQ
3747 decompress_threads = NULL;
3748 decomp_param = NULL;
34ab9e97 3749 decomp_file = NULL;
56e93d26
JQ
3750}
3751
34ab9e97 3752static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3753{
3754 int i, thread_count;
3755
3756 if (!migrate_use_compression()) {
3757 return 0;
3758 }
3759
3760 thread_count = migrate_decompress_threads();
3761 decompress_threads = g_new0(QemuThread, thread_count);
3762 decomp_param = g_new0(DecompressParam, thread_count);
3763 qemu_mutex_init(&decomp_done_lock);
3764 qemu_cond_init(&decomp_done_cond);
34ab9e97 3765 decomp_file = f;
797ca154
XG
3766 for (i = 0; i < thread_count; i++) {
3767 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3768 goto exit;
3769 }
3770
3771 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3772 qemu_mutex_init(&decomp_param[i].mutex);
3773 qemu_cond_init(&decomp_param[i].cond);
3774 decomp_param[i].done = true;
3775 decomp_param[i].quit = false;
3776 qemu_thread_create(decompress_threads + i, "decompress",
3777 do_data_decompress, decomp_param + i,
3778 QEMU_THREAD_JOINABLE);
3779 }
3780 return 0;
3781exit:
3782 compress_threads_load_cleanup();
3783 return -1;
3784}
3785
c1bc6626 3786static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3787 void *host, int len)
3788{
3789 int idx, thread_count;
3790
3791 thread_count = migrate_decompress_threads();
37396950 3792 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3793 while (true) {
3794 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3795 if (decomp_param[idx].done) {
33d151f4
LL
3796 decomp_param[idx].done = false;
3797 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3798 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3799 decomp_param[idx].des = host;
3800 decomp_param[idx].len = len;
33d151f4
LL
3801 qemu_cond_signal(&decomp_param[idx].cond);
3802 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3803 break;
3804 }
3805 }
3806 if (idx < thread_count) {
3807 break;
73a8912b
LL
3808 } else {
3809 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3810 }
3811 }
3812}
3813
b70cb3b4
RL
3814static void colo_init_ram_state(void)
3815{
3816 ram_state_init(&ram_state);
b70cb3b4
RL
3817}
3818
13af18f2
ZC
3819/*
3820 * colo cache: this is for secondary VM, we cache the whole
3821 * memory of the secondary VM, it is need to hold the global lock
3822 * to call this helper.
3823 */
3824int colo_init_ram_cache(void)
3825{
3826 RAMBlock *block;
3827
44901b5a
PB
3828 WITH_RCU_READ_LOCK_GUARD() {
3829 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3830 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3831 NULL, false, false);
44901b5a
PB
3832 if (!block->colo_cache) {
3833 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3834 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3835 block->used_length);
3836 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3837 if (block->colo_cache) {
3838 qemu_anon_ram_free(block->colo_cache, block->used_length);
3839 block->colo_cache = NULL;
3840 }
89ac5a1d 3841 }
44901b5a 3842 return -errno;
89ac5a1d 3843 }
e5fdf920
LS
3844 if (!machine_dump_guest_core(current_machine)) {
3845 qemu_madvise(block->colo_cache, block->used_length,
3846 QEMU_MADV_DONTDUMP);
3847 }
13af18f2 3848 }
13af18f2 3849 }
44901b5a 3850
7d9acafa
ZC
3851 /*
3852 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3853 * with to decide which page in cache should be flushed into SVM's RAM. Here
3854 * we use the same name 'ram_bitmap' as for migration.
3855 */
3856 if (ram_bytes_total()) {
3857 RAMBlock *block;
3858
fbd162e6 3859 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3860 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3861 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3862 }
3863 }
7d9acafa 3864
b70cb3b4 3865 colo_init_ram_state();
13af18f2 3866 return 0;
13af18f2
ZC
3867}
3868
0393031a
HZ
3869/* TODO: duplicated with ram_init_bitmaps */
3870void colo_incoming_start_dirty_log(void)
3871{
3872 RAMBlock *block = NULL;
3873 /* For memory_global_dirty_log_start below. */
3874 qemu_mutex_lock_iothread();
3875 qemu_mutex_lock_ramlist();
3876
3877 memory_global_dirty_log_sync();
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3880 ramblock_sync_dirty_bitmap(ram_state, block);
3881 /* Discard this dirty bitmap record */
3882 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3883 }
63b41db4 3884 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3885 }
3886 ram_state->migration_dirty_pages = 0;
3887 qemu_mutex_unlock_ramlist();
3888 qemu_mutex_unlock_iothread();
3889}
3890
13af18f2
ZC
3891/* It is need to hold the global lock to call this helper */
3892void colo_release_ram_cache(void)
3893{
3894 RAMBlock *block;
3895
63b41db4 3896 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3897 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3898 g_free(block->bmap);
3899 block->bmap = NULL;
3900 }
3901
89ac5a1d
DDAG
3902 WITH_RCU_READ_LOCK_GUARD() {
3903 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3904 if (block->colo_cache) {
3905 qemu_anon_ram_free(block->colo_cache, block->used_length);
3906 block->colo_cache = NULL;
3907 }
13af18f2
ZC
3908 }
3909 }
0393031a 3910 ram_state_cleanup(&ram_state);
13af18f2
ZC
3911}
3912
f265e0e4
JQ
3913/**
3914 * ram_load_setup: Setup RAM for migration incoming side
3915 *
3916 * Returns zero to indicate success and negative for error
3917 *
3918 * @f: QEMUFile where to receive the data
3919 * @opaque: RAMState pointer
3920 */
3921static int ram_load_setup(QEMUFile *f, void *opaque)
3922{
34ab9e97 3923 if (compress_threads_load_setup(f)) {
797ca154
XG
3924 return -1;
3925 }
3926
f265e0e4 3927 xbzrle_load_setup();
f9494614 3928 ramblock_recv_map_init();
13af18f2 3929
f265e0e4
JQ
3930 return 0;
3931}
3932
3933static int ram_load_cleanup(void *opaque)
3934{
f9494614 3935 RAMBlock *rb;
56eb90af 3936
fbd162e6 3937 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3938 qemu_ram_block_writeback(rb);
56eb90af
JH
3939 }
3940
f265e0e4 3941 xbzrle_load_cleanup();
f0afa331 3942 compress_threads_load_cleanup();
f9494614 3943
fbd162e6 3944 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3945 g_free(rb->receivedmap);
3946 rb->receivedmap = NULL;
3947 }
13af18f2 3948
f265e0e4
JQ
3949 return 0;
3950}
3951
3d0684b2
JQ
3952/**
3953 * ram_postcopy_incoming_init: allocate postcopy data structures
3954 *
3955 * Returns 0 for success and negative if there was one error
3956 *
3957 * @mis: current migration incoming state
3958 *
3959 * Allocate data structures etc needed by incoming migration with
3960 * postcopy-ram. postcopy-ram's similarly names
3961 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3962 */
3963int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3964{
c136180c 3965 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3966}
3967
3d0684b2
JQ
3968/**
3969 * ram_load_postcopy: load a page in postcopy case
3970 *
3971 * Returns 0 for success or -errno in case of error
3972 *
a7180877
DDAG
3973 * Called in postcopy mode by ram_load().
3974 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3975 *
3976 * @f: QEMUFile where to send the data
36f62f11 3977 * @channel: the channel to use for loading
a7180877 3978 */
36f62f11 3979int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
3980{
3981 int flags = 0, ret = 0;
3982 bool place_needed = false;
1aa83678 3983 bool matches_target_page_size = false;
a7180877 3984 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 3985 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
3986
3987 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3988 ram_addr_t addr;
a7180877
DDAG
3989 void *page_buffer = NULL;
3990 void *place_source = NULL;
df9ff5e1 3991 RAMBlock *block = NULL;
a7180877 3992 uint8_t ch;
644acf99 3993 int len;
a7180877
DDAG
3994
3995 addr = qemu_get_be64(f);
7a9ddfbf
PX
3996
3997 /*
3998 * If qemu file error, we should stop here, and then "addr"
3999 * may be invalid
4000 */
4001 ret = qemu_file_get_error(f);
4002 if (ret) {
4003 break;
4004 }
4005
a7180877
DDAG
4006 flags = addr & ~TARGET_PAGE_MASK;
4007 addr &= TARGET_PAGE_MASK;
4008
36f62f11 4009 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4010 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4011 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4012 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4013 if (!block) {
4014 ret = -EINVAL;
4015 break;
4016 }
4c4bad48 4017
898ba906
DH
4018 /*
4019 * Relying on used_length is racy and can result in false positives.
4020 * We might place pages beyond used_length in case RAM was shrunk
4021 * while in postcopy, which is fine - trying to place via
4022 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4023 */
4024 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4025 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4026 ret = -EINVAL;
4027 break;
4028 }
77dadc3f 4029 tmp_page->target_pages++;
1aa83678 4030 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4031 /*
28abd200
DDAG
4032 * Postcopy requires that we place whole host pages atomically;
4033 * these may be huge pages for RAMBlocks that are backed by
4034 * hugetlbfs.
a7180877
DDAG
4035 * To make it atomic, the data is read into a temporary page
4036 * that's moved into place later.
4037 * The migration protocol uses, possibly smaller, target-pages
4038 * however the source ensures it always sends all the components
91ba442f 4039 * of a host page in one chunk.
a7180877 4040 */
77dadc3f 4041 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4042 host_page_offset_from_ram_block_offset(block, addr);
4043 /* If all TP are zero then we can optimise the place */
77dadc3f
PX
4044 if (tmp_page->target_pages == 1) {
4045 tmp_page->host_addr =
4046 host_page_from_ram_block_offset(block, addr);
4047 } else if (tmp_page->host_addr !=
4048 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4049 /* not the 1st TP within the HP */
36f62f11 4050 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4051 "Target host page %p, received host page %p "
4052 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4053 channel, tmp_page->host_addr,
cfc7dc8a
PX
4054 host_page_from_ram_block_offset(block, addr),
4055 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4056 ret = -EINVAL;
4057 break;
a7180877
DDAG
4058 }
4059
4060 /*
4061 * If it's the last part of a host page then we place the host
4062 * page
4063 */
77dadc3f
PX
4064 if (tmp_page->target_pages ==
4065 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4066 place_needed = true;
4cbb3c63 4067 }
77dadc3f 4068 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4069 }
4070
4071 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4072 case RAM_SAVE_FLAG_ZERO:
a7180877 4073 ch = qemu_get_byte(f);
2e36bc1b
WY
4074 /*
4075 * Can skip to set page_buffer when
4076 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4077 */
4078 if (ch || !matches_target_page_size) {
4079 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4080 }
a7180877 4081 if (ch) {
77dadc3f 4082 tmp_page->all_zero = false;
a7180877
DDAG
4083 }
4084 break;
4085
4086 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4087 tmp_page->all_zero = false;
1aa83678
PX
4088 if (!matches_target_page_size) {
4089 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4090 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4091 } else {
1aa83678
PX
4092 /*
4093 * For small pages that matches target page size, we
4094 * avoid the qemu_file copy. Instead we directly use
4095 * the buffer of QEMUFile to place the page. Note: we
4096 * cannot do any QEMUFile operation before using that
4097 * buffer to make sure the buffer is valid when
4098 * placing the page.
a7180877
DDAG
4099 */
4100 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4101 TARGET_PAGE_SIZE);
4102 }
4103 break;
644acf99 4104 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4105 tmp_page->all_zero = false;
644acf99
WY
4106 len = qemu_get_be32(f);
4107 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4108 error_report("Invalid compressed data length: %d", len);
4109 ret = -EINVAL;
4110 break;
4111 }
4112 decompress_data_with_multi_threads(f, page_buffer, len);
4113 break;
4114
a7180877
DDAG
4115 case RAM_SAVE_FLAG_EOS:
4116 /* normal exit */
6df264ac 4117 multifd_recv_sync_main();
a7180877
DDAG
4118 break;
4119 default:
29fccade 4120 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4121 " (postcopy mode)", flags);
4122 ret = -EINVAL;
7a9ddfbf
PX
4123 break;
4124 }
4125
644acf99
WY
4126 /* Got the whole host page, wait for decompress before placing. */
4127 if (place_needed) {
4128 ret |= wait_for_decompress_done();
4129 }
4130
7a9ddfbf
PX
4131 /* Detect for any possible file errors */
4132 if (!ret && qemu_file_get_error(f)) {
4133 ret = qemu_file_get_error(f);
a7180877
DDAG
4134 }
4135
7a9ddfbf 4136 if (!ret && place_needed) {
77dadc3f
PX
4137 if (tmp_page->all_zero) {
4138 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4139 } else {
77dadc3f
PX
4140 ret = postcopy_place_page(mis, tmp_page->host_addr,
4141 place_source, block);
a7180877 4142 }
ddf35bdf 4143 place_needed = false;
77dadc3f 4144 postcopy_temp_page_reset(tmp_page);
a7180877 4145 }
a7180877
DDAG
4146 }
4147
4148 return ret;
4149}
4150
acab30b8
DHB
4151static bool postcopy_is_running(void)
4152{
4153 PostcopyState ps = postcopy_state_get();
4154 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4155}
4156
e6f4aa18
ZC
4157/*
4158 * Flush content of RAM cache into SVM's memory.
4159 * Only flush the pages that be dirtied by PVM or SVM or both.
4160 */
24fa16f8 4161void colo_flush_ram_cache(void)
e6f4aa18
ZC
4162{
4163 RAMBlock *block = NULL;
4164 void *dst_host;
4165 void *src_host;
4166 unsigned long offset = 0;
4167
d1955d22 4168 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4169 WITH_RCU_READ_LOCK_GUARD() {
4170 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4171 ramblock_sync_dirty_bitmap(ram_state, block);
4172 }
d1955d22 4173 }
d1955d22 4174
e6f4aa18 4175 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4176 WITH_RCU_READ_LOCK_GUARD() {
4177 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4178
89ac5a1d 4179 while (block) {
a6a83cef 4180 unsigned long num = 0;
e6f4aa18 4181
a6a83cef 4182 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4183 if (!offset_in_ramblock(block,
4184 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4185 offset = 0;
a6a83cef 4186 num = 0;
89ac5a1d
DDAG
4187 block = QLIST_NEXT_RCU(block, next);
4188 } else {
a6a83cef
RL
4189 unsigned long i = 0;
4190
4191 for (i = 0; i < num; i++) {
4192 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4193 }
8bba004c
AR
4194 dst_host = block->host
4195 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4196 src_host = block->colo_cache
4197 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4198 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4199 offset += num;
89ac5a1d 4200 }
e6f4aa18
ZC
4201 }
4202 }
e6f4aa18
ZC
4203 trace_colo_flush_ram_cache_end();
4204}
4205
10da4a36
WY
4206/**
4207 * ram_load_precopy: load pages in precopy case
4208 *
4209 * Returns 0 for success or -errno in case of error
4210 *
4211 * Called in precopy mode by ram_load().
4212 * rcu_read_lock is taken prior to this being called.
4213 *
4214 * @f: QEMUFile where to send the data
4215 */
4216static int ram_load_precopy(QEMUFile *f)
56e93d26 4217{
755e8d7c 4218 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4219 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4220 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4221 bool postcopy_advised = migration_incoming_postcopy_advised();
edc60127
JQ
4222 if (!migrate_use_compression()) {
4223 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4224 }
a7180877 4225
10da4a36 4226 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4227 ram_addr_t addr, total_ram_bytes;
0393031a 4228 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4229 uint8_t ch;
4230
e65cec5e
YK
4231 /*
4232 * Yield periodically to let main loop run, but an iteration of
4233 * the main loop is expensive, so do it each some iterations
4234 */
4235 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4236 aio_co_schedule(qemu_get_current_aio_context(),
4237 qemu_coroutine_self());
4238 qemu_coroutine_yield();
4239 }
4240 i++;
4241
56e93d26
JQ
4242 addr = qemu_get_be64(f);
4243 flags = addr & ~TARGET_PAGE_MASK;
4244 addr &= TARGET_PAGE_MASK;
4245
edc60127
JQ
4246 if (flags & invalid_flags) {
4247 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4248 error_report("Received an unexpected compressed page");
4249 }
4250
4251 ret = -EINVAL;
4252 break;
4253 }
4254
bb890ed5 4255 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4256 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4257 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4258 RAM_CHANNEL_PRECOPY);
4c4bad48 4259
0393031a 4260 host = host_from_ram_block_offset(block, addr);
13af18f2 4261 /*
0393031a
HZ
4262 * After going into COLO stage, we should not load the page
4263 * into SVM's memory directly, we put them into colo_cache firstly.
4264 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4265 * Previously, we copied all these memory in preparing stage of COLO
4266 * while we need to stop VM, which is a time-consuming process.
4267 * Here we optimize it by a trick, back-up every page while in
4268 * migration process while COLO is enabled, though it affects the
4269 * speed of the migration, but it obviously reduce the downtime of
4270 * back-up all SVM'S memory in COLO preparing stage.
13af18f2 4271 */
0393031a
HZ
4272 if (migration_incoming_colo_enabled()) {
4273 if (migration_incoming_in_colo_state()) {
4274 /* In COLO stage, put all pages into cache temporarily */
8af66371 4275 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4276 } else {
4277 /*
4278 * In migration stage but before COLO stage,
4279 * Put all pages into both cache and SVM's memory.
4280 */
8af66371 4281 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4282 }
13af18f2 4283 }
a776aa15
DDAG
4284 if (!host) {
4285 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4286 ret = -EINVAL;
4287 break;
4288 }
13af18f2
ZC
4289 if (!migration_incoming_in_colo_state()) {
4290 ramblock_recv_bitmap_set(block, host);
4291 }
4292
1db9d8e5 4293 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4294 }
4295
56e93d26
JQ
4296 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4297 case RAM_SAVE_FLAG_MEM_SIZE:
4298 /* Synchronize RAM block list */
4299 total_ram_bytes = addr;
4300 while (!ret && total_ram_bytes) {
4301 RAMBlock *block;
56e93d26
JQ
4302 char id[256];
4303 ram_addr_t length;
4304
4305 len = qemu_get_byte(f);
4306 qemu_get_buffer(f, (uint8_t *)id, len);
4307 id[len] = 0;
4308 length = qemu_get_be64(f);
4309
e3dd7493 4310 block = qemu_ram_block_by_name(id);
b895de50
CLG
4311 if (block && !qemu_ram_is_migratable(block)) {
4312 error_report("block %s should not be migrated !", id);
4313 ret = -EINVAL;
4314 } else if (block) {
e3dd7493
DDAG
4315 if (length != block->used_length) {
4316 Error *local_err = NULL;
56e93d26 4317
fa53a0e5 4318 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4319 &local_err);
4320 if (local_err) {
4321 error_report_err(local_err);
56e93d26 4322 }
56e93d26 4323 }
ef08fb38 4324 /* For postcopy we need to check hugepage sizes match */
e846b746 4325 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4326 block->page_size != qemu_host_page_size) {
4327 uint64_t remote_page_size = qemu_get_be64(f);
4328 if (remote_page_size != block->page_size) {
4329 error_report("Mismatched RAM page size %s "
4330 "(local) %zd != %" PRId64,
4331 id, block->page_size,
4332 remote_page_size);
4333 ret = -EINVAL;
4334 }
4335 }
fbd162e6
YK
4336 if (migrate_ignore_shared()) {
4337 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4338 if (ramblock_is_ignored(block) &&
4339 block->mr->addr != addr) {
4340 error_report("Mismatched GPAs for block %s "
4341 "%" PRId64 "!= %" PRId64,
4342 id, (uint64_t)addr,
4343 (uint64_t)block->mr->addr);
4344 ret = -EINVAL;
4345 }
4346 }
e3dd7493
DDAG
4347 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4348 block->idstr);
4349 } else {
56e93d26
JQ
4350 error_report("Unknown ramblock \"%s\", cannot "
4351 "accept migration", id);
4352 ret = -EINVAL;
4353 }
4354
4355 total_ram_bytes -= length;
4356 }
4357 break;
a776aa15 4358
bb890ed5 4359 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4360 ch = qemu_get_byte(f);
4361 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4362 break;
a776aa15 4363
56e93d26 4364 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4365 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4366 break;
56e93d26 4367
a776aa15 4368 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4369 len = qemu_get_be32(f);
4370 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4371 error_report("Invalid compressed data length: %d", len);
4372 ret = -EINVAL;
4373 break;
4374 }
c1bc6626 4375 decompress_data_with_multi_threads(f, host, len);
56e93d26 4376 break;
a776aa15 4377
56e93d26 4378 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4379 if (load_xbzrle(f, addr, host) < 0) {
4380 error_report("Failed to decompress XBZRLE page at "
4381 RAM_ADDR_FMT, addr);
4382 ret = -EINVAL;
4383 break;
4384 }
4385 break;
4386 case RAM_SAVE_FLAG_EOS:
4387 /* normal exit */
6df264ac 4388 multifd_recv_sync_main();
56e93d26
JQ
4389 break;
4390 default:
4391 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4392 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4393 } else {
29fccade 4394 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4395 flags);
4396 ret = -EINVAL;
4397 }
4398 }
4399 if (!ret) {
4400 ret = qemu_file_get_error(f);
4401 }
0393031a
HZ
4402 if (!ret && host_bak) {
4403 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4404 }
56e93d26
JQ
4405 }
4406
ca1a6b70 4407 ret |= wait_for_decompress_done();
10da4a36
WY
4408 return ret;
4409}
4410
4411static int ram_load(QEMUFile *f, void *opaque, int version_id)
4412{
4413 int ret = 0;
4414 static uint64_t seq_iter;
4415 /*
4416 * If system is running in postcopy mode, page inserts to host memory must
4417 * be atomic
4418 */
4419 bool postcopy_running = postcopy_is_running();
4420
4421 seq_iter++;
4422
4423 if (version_id != 4) {
4424 return -EINVAL;
4425 }
4426
4427 /*
4428 * This RCU critical section can be very long running.
4429 * When RCU reclaims in the code start to become numerous,
4430 * it will be necessary to reduce the granularity of this
4431 * critical section.
4432 */
89ac5a1d
DDAG
4433 WITH_RCU_READ_LOCK_GUARD() {
4434 if (postcopy_running) {
36f62f11
PX
4435 /*
4436 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4437 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4438 * service fast page faults.
4439 */
4440 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4441 } else {
4442 ret = ram_load_precopy(f);
4443 }
10da4a36 4444 }
55c4446b 4445 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4446
56e93d26
JQ
4447 return ret;
4448}
4449
c6467627
VSO
4450static bool ram_has_postcopy(void *opaque)
4451{
469dd51b 4452 RAMBlock *rb;
fbd162e6 4453 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4454 if (ramblock_is_pmem(rb)) {
4455 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4456 "is not supported now!", rb->idstr, rb->host);
4457 return false;
4458 }
4459 }
4460
c6467627
VSO
4461 return migrate_postcopy_ram();
4462}
4463
edd090c7
PX
4464/* Sync all the dirty bitmap with destination VM. */
4465static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4466{
4467 RAMBlock *block;
4468 QEMUFile *file = s->to_dst_file;
4469 int ramblock_count = 0;
4470
4471 trace_ram_dirty_bitmap_sync_start();
4472
fbd162e6 4473 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4474 qemu_savevm_send_recv_bitmap(file, block->idstr);
4475 trace_ram_dirty_bitmap_request(block->idstr);
4476 ramblock_count++;
4477 }
4478
4479 trace_ram_dirty_bitmap_sync_wait();
4480
4481 /* Wait until all the ramblocks' dirty bitmap synced */
4482 while (ramblock_count--) {
4483 qemu_sem_wait(&s->rp_state.rp_sem);
4484 }
4485
4486 trace_ram_dirty_bitmap_sync_complete();
4487
4488 return 0;
4489}
4490
4491static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4492{
4493 qemu_sem_post(&s->rp_state.rp_sem);
4494}
4495
a335debb
PX
4496/*
4497 * Read the received bitmap, revert it as the initial dirty bitmap.
4498 * This is only used when the postcopy migration is paused but wants
4499 * to resume from a middle point.
4500 */
4501int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4502{
4503 int ret = -EINVAL;
43044ac0 4504 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4505 QEMUFile *file = s->rp_state.from_dst_file;
4506 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4507 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4508 uint64_t size, end_mark;
4509
4510 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4511
4512 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4513 error_report("%s: incorrect state %s", __func__,
4514 MigrationStatus_str(s->state));
4515 return -EINVAL;
4516 }
4517
4518 /*
4519 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4520 * need the endianness conversion, and the paddings.
a335debb
PX
4521 */
4522 local_size = ROUND_UP(local_size, 8);
4523
4524 /* Add paddings */
4525 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4526
4527 size = qemu_get_be64(file);
4528
4529 /* The size of the bitmap should match with our ramblock */
4530 if (size != local_size) {
4531 error_report("%s: ramblock '%s' bitmap size mismatch "
4532 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4533 block->idstr, size, local_size);
4534 ret = -EINVAL;
4535 goto out;
4536 }
4537
4538 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4539 end_mark = qemu_get_be64(file);
4540
4541 ret = qemu_file_get_error(file);
4542 if (ret || size != local_size) {
4543 error_report("%s: read bitmap failed for ramblock '%s': %d"
4544 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4545 __func__, block->idstr, ret, local_size, size);
4546 ret = -EIO;
4547 goto out;
4548 }
4549
4550 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4551 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4552 __func__, block->idstr, end_mark);
4553 ret = -EINVAL;
4554 goto out;
4555 }
4556
4557 /*
3a4452d8 4558 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4559 * The dirty bitmap won't change. We can directly modify it.
4560 */
4561 bitmap_from_le(block->bmap, le_bitmap, nbits);
4562
4563 /*
4564 * What we received is "received bitmap". Revert it as the initial
4565 * dirty bitmap for this ramblock.
4566 */
4567 bitmap_complement(block->bmap, block->bmap, nbits);
4568
be39b4cd
DH
4569 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4570 ramblock_dirty_bitmap_clear_discarded_pages(block);
4571
4572 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4573 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4574
edd090c7
PX
4575 /*
4576 * We succeeded to sync bitmap for current ramblock. If this is
4577 * the last one to sync, we need to notify the main send thread.
4578 */
4579 ram_dirty_bitmap_reload_notify(s);
4580
a335debb
PX
4581 ret = 0;
4582out:
bf269906 4583 g_free(le_bitmap);
a335debb
PX
4584 return ret;
4585}
4586
edd090c7
PX
4587static int ram_resume_prepare(MigrationState *s, void *opaque)
4588{
4589 RAMState *rs = *(RAMState **)opaque;
08614f34 4590 int ret;
edd090c7 4591
08614f34
PX
4592 ret = ram_dirty_bitmap_sync_all(s, rs);
4593 if (ret) {
4594 return ret;
4595 }
4596
4597 ram_state_resume_prepare(rs, s->to_dst_file);
4598
4599 return 0;
edd090c7
PX
4600}
4601
36f62f11
PX
4602void postcopy_preempt_shutdown_file(MigrationState *s)
4603{
4604 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4605 qemu_fflush(s->postcopy_qemufile_src);
4606}
4607
56e93d26 4608static SaveVMHandlers savevm_ram_handlers = {
9907e842 4609 .save_setup = ram_save_setup,
56e93d26 4610 .save_live_iterate = ram_save_iterate,
763c906b 4611 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4612 .save_live_complete_precopy = ram_save_complete,
c6467627 4613 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4614 .state_pending_exact = ram_state_pending_exact,
4615 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4616 .load_state = ram_load,
f265e0e4
JQ
4617 .save_cleanup = ram_save_cleanup,
4618 .load_setup = ram_load_setup,
4619 .load_cleanup = ram_load_cleanup,
edd090c7 4620 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4621};
4622
c7c0e724
DH
4623static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4624 size_t old_size, size_t new_size)
4625{
cc61c703 4626 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4627 ram_addr_t offset;
4628 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4629 Error *err = NULL;
4630
4631 if (ramblock_is_ignored(rb)) {
4632 return;
4633 }
4634
4635 if (!migration_is_idle()) {
4636 /*
4637 * Precopy code on the source cannot deal with the size of RAM blocks
4638 * changing at random points in time - especially after sending the
4639 * RAM block sizes in the migration stream, they must no longer change.
4640 * Abort and indicate a proper reason.
4641 */
4642 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4643 migration_cancel(err);
c7c0e724 4644 error_free(err);
c7c0e724 4645 }
cc61c703
DH
4646
4647 switch (ps) {
4648 case POSTCOPY_INCOMING_ADVISE:
4649 /*
4650 * Update what ram_postcopy_incoming_init()->init_range() does at the
4651 * time postcopy was advised. Syncing RAM blocks with the source will
4652 * result in RAM resizes.
4653 */
4654 if (old_size < new_size) {
4655 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4656 error_report("RAM block '%s' discard of resized RAM failed",
4657 rb->idstr);
4658 }
4659 }
898ba906 4660 rb->postcopy_length = new_size;
cc61c703
DH
4661 break;
4662 case POSTCOPY_INCOMING_NONE:
4663 case POSTCOPY_INCOMING_RUNNING:
4664 case POSTCOPY_INCOMING_END:
4665 /*
4666 * Once our guest is running, postcopy does no longer care about
4667 * resizes. When growing, the new memory was not available on the
4668 * source, no handler needed.
4669 */
4670 break;
4671 default:
4672 error_report("RAM block '%s' resized during postcopy state: %d",
4673 rb->idstr, ps);
4674 exit(-1);
4675 }
c7c0e724
DH
4676}
4677
4678static RAMBlockNotifier ram_mig_ram_notifier = {
4679 .ram_block_resized = ram_mig_ram_block_resized,
4680};
4681
56e93d26
JQ
4682void ram_mig_init(void)
4683{
4684 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4685 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4686 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4687}