/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100

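/*
 * A minimal illustration (hypothetical values, not part of the definitions
 * above): these flags are OR-ed into the unused low bits of the page offset
 * that save_page_header() writes, so a single be64 word names both the page
 * and how it is encoded:
 *
 *     ram_addr_t offset = 0x200000;                      // page aligned
 *     uint64_t header = offset | RAM_SAVE_FLAG_ZERO      // page is all zeros
 *                              | RAM_SAVE_FLAG_CONTINUE; // same block as last
 *
 * The load side splits the word back into (header & TARGET_PAGE_MASK) for the
 * offset and the remaining low bits for the flags.
 */
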
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set to the failure reason if the resize fails
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes) + end_mark (8 bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that the bitmap stays well defined even when source
     * and destination VMs do not use the same endianness.
     * (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

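/*
 * For illustration only (a hedged sketch, not the actual reader, which lives
 * in the postcopy-recovery code elsewhere in this file): parsing the format
 * above amounts to the following, with error handling and the little-endian
 * conversion omitted:
 *
 *     uint64_t size = qemu_get_be64(file);        // padded bitmap size
 *     uint8_t *buf = g_malloc(size);
 *     qemu_get_buffer(file, buf, size);           // LE bitmap payload
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // stream corrupted
 *     }
 */
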
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Number of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Number of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

static void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        ram_counters.postcopy_bytes += bytes;
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    ram_counters.transferred += bytes;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether current page is explicitly requested by postcopy */
    bool postcopy_requested;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If the page belongs to a different block than the last page sent, it
 * also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

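/*
 * Worked example (hypothetical numbers): sending the first page of a block
 * named "pc.ram" at offset 0x1000 with RAM_SAVE_FLAG_PAGE produces
 *
 *     8 bytes   be64(0x1000 | RAM_SAVE_FLAG_PAGE)
 *     1 byte    strlen("pc.ram") == 6
 *     6 bytes   "pc.ram"
 *
 * so save_page_header() returns 15. Later pages of the same block get
 * RAM_SAVE_FLAG_CONTINUE instead and the header shrinks to 8 bytes.
 */
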
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce the amount of guest CPU execution to hopefully slow down memory
 * writes. If the guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

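/*
 * Worked example for the tailslow path above (hypothetical numbers): with the
 * throttle at 20%, cpu_now is 80. If the guest dirtied 400MB in the last
 * period against a 100MB threshold, cpu_ideal = 80 * (100 / 400) = 20, so the
 * ideal increment is 80 - 20 = 60; capped by cpu_throttle_increment (say 10)
 * the throttle moves to MIN(20 + 10, max_cpu_throttle). Without tailslow the
 * increment is always the configured cpu_throttle_increment.
 */
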
void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache; no matter what
     * the encoding result is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) are being encoded; the first page's encoding
     * generates 2KB and the 2nd page turns out to be skipped (i.e. no new
     * bytes written to the page), so the overall encoding rate is
     * 8KB / 2KB = 4, which includes the skipped page. In this way, the
     * encoding rate can tell if the guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear the dirty bitmap if needed. This _must_ be called before we
     * send any page in the chunk because we need to make sure we can
     * capture further page content changes when we sync the dirty log
     * the next time. So as long as we are going to send any page in the
     * chunk we clear the remote dirty bitmap for all of it.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called within an RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

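/*
 * Example (hypothetical memory layout): a VM whose RAM blocks use 4KiB base
 * pages plus one block backed by 2MiB hugepages yields a summary of
 * 0x1000 | 0x200000 = 0x201000, so callers can tell at a glance that more
 * than one page size is in use.
 */
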
uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
        / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                        rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

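/*
 * Worked example (hypothetical period): if 100 target pages of 4KiB were
 * handed to the compression threads during the period (400KiB uncompressed)
 * and compressed_size grew by 100KiB, the code above reports
 * compression_rate = 400 / 100 = 4.0. The xbzrle encoding_rate is derived
 * the same way from unencoded vs. encoded bytes.
 */
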
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

static void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of the data written to the file, or 0 if the page is
 * not a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}

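/*
 * On the wire a zero page is therefore tiny (hypothetical first-page case):
 * an 8-byte header with RAM_SAVE_FLAG_ZERO set, the block id string when the
 * block changes, and a single 0x00 payload byte; save_zero_page() accounts
 * the whole thing against ram_counters.duplicate.
 */
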
/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_transferred_add(save_page_header(rs, rs->f, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        return true;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * Copy it to an internal buffer to avoid it being modified by the VM,
     * so that we can catch errors during compression and
     * decompression.
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    /* This is not a postcopy requested page */
    pss->postcopy_requested = false;

    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;
    size_t page_size;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;
    page_size = qemu_ram_pagesize(block);
    /* Each page request should be a multiple of the ramblock's page size */
    assert((entry->len % page_size) == 0);

    if (entry->len > page_size) {
        entry->len -= page_size;
        entry->offset += page_size;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    trace_unqueue_page(block->idstr, *offset,
                       test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));

    return block;
}

278e2f55
AG
1593#if defined(__linux__)
1594/**
1595 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1596 * is found, return RAM block pointer and page offset
1597 *
1598 * Returns pointer to the RAMBlock containing faulting page,
1599 * NULL if no write faults are pending
1600 *
1601 * @rs: current RAM state
1602 * @offset: page offset from the beginning of the block
1603 */
1604static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1605{
1606 struct uffd_msg uffd_msg;
1607 void *page_address;
82ea3e3b 1608 RAMBlock *block;
278e2f55
AG
1609 int res;
1610
1611 if (!migrate_background_snapshot()) {
1612 return NULL;
1613 }
1614
1615 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1616 if (res <= 0) {
1617 return NULL;
1618 }
1619
1620 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1621 block = qemu_ram_block_from_host(page_address, false, offset);
1622 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1623 return block;
278e2f55
AG
1624}
1625
1626/**
1627 * ram_save_release_protection: release UFFD write protection after
1628 * a range of pages has been saved
1629 *
1630 * @rs: current RAM state
1631 * @pss: page-search-status structure
1632 * @start_page: index of the first page in the range relative to pss->block
1633 *
1634 * Returns 0 on success, negative value in case of an error
1635*/
1636static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1637 unsigned long start_page)
1638{
1639 int res = 0;
1640
1641 /* Check if page is from UFFD-managed region. */
1642 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1643 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1644 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1645
1646 /* Flush async buffers before un-protect. */
1647 qemu_fflush(rs->f);
1648 /* Un-protect memory range. */
1649 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1650 false, false);
1651 }
1652
1653 return res;
1654}
1655
1656/* ram_write_tracking_available: check if the kernel supports the required UFFD features
1657 *
1658 * Returns true if supported, false otherwise
1659 */
1660bool ram_write_tracking_available(void)
1661{
1662 uint64_t uffd_features;
1663 int res;
1664
1665 res = uffd_query_features(&uffd_features);
1666 return (res == 0 &&
1667 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1668}
1669
1670/* ram_write_tracking_compatible: check if guest configuration is
1671 * compatible with 'write-tracking'
1672 *
1673 * Returns true if compatible, false otherwise
1674 */
1675bool ram_write_tracking_compatible(void)
1676{
1677 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1678 int uffd_fd;
82ea3e3b 1679 RAMBlock *block;
278e2f55
AG
1680 bool ret = false;
1681
1682 /* Open UFFD file descriptor */
1683 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1684 if (uffd_fd < 0) {
1685 return false;
1686 }
1687
1688 RCU_READ_LOCK_GUARD();
1689
82ea3e3b 1690 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1691 uint64_t uffd_ioctls;
1692
1693 /* Nothing to do for read-only and MMIO-writable regions */
82ea3e3b 1694 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1695 continue;
1696 }
1697 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1698 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1699 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1700 goto out;
1701 }
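 /* The registered range must also support the write-protect ioctl, otherwise write tracking cannot work on this block */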
1702 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1703 goto out;
1704 }
1705 }
1706 ret = true;
1707
1708out:
1709 uffd_close_fd(uffd_fd);
1710 return ret;
1711}
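/*
 * Illustrative sketch only (hypothetical caller, not part of this file):
 * the capability-checking code is expected to gate the background snapshot
 * feature on both of the checks above, roughly:
 *
 *     if (!ram_write_tracking_available() ||
 *         !ram_write_tracking_compatible()) {
 *         ... refuse to enable background snapshots ...
 *     }
 */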
1712
f7b9dcfb
DH
1713static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1714 ram_addr_t size)
1715{
1716 /*
1717 * We read one byte of each page; this will preallocate page tables if
1718 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1719 * where no page was populated yet. This might require adaptation when
1720 * supporting other mappings, like shmem.
1721 */
1722 for (; offset < size; offset += block->page_size) {
1723 char tmp = *((char *)block->host + offset);
1724
1725 /* Don't optimize the read out */
1726 asm volatile("" : "+r" (tmp));
1727 }
1728}
1729
6fee3a1f
DH
1730static inline int populate_read_section(MemoryRegionSection *section,
1731 void *opaque)
1732{
1733 const hwaddr size = int128_get64(section->size);
1734 hwaddr offset = section->offset_within_region;
1735 RAMBlock *block = section->mr->ram_block;
1736
1737 populate_read_range(block, offset, size);
1738 return 0;
1739}
1740
eeccb99c 1741/*
f7b9dcfb
DH
1742 * ram_block_populate_read: preallocate page tables and populate pages in the
1743 * RAM block by reading a byte of each page.
eeccb99c
AG
1744 *
1745 * Since it's solely used for userfault_fd WP feature, here we just
1746 * hardcode page size to qemu_real_host_page_size.
1747 *
82ea3e3b 1748 * @rb: RAM block to populate
eeccb99c 1749 */
6fee3a1f 1750static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1751{
6fee3a1f
DH
1752 /*
1753 * Skip populating all pages that fall into a discarded range as managed by
1754 * a RamDiscardManager responsible for the mapped memory region of the
1755 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1756 * must not get populated automatically. We don't have to track
1757 * modifications via userfaultfd WP reliably, because these pages will
1758 * not be part of the migration stream either way -- see
1759 * ramblock_dirty_bitmap_exclude_discarded_pages().
1760 *
1761 * Note: The result is only stable while migrating (precopy/postcopy).
1762 */
1763 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1764 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1765 MemoryRegionSection section = {
1766 .mr = rb->mr,
1767 .offset_within_region = 0,
1768 .size = rb->mr->size,
1769 };
1770
1771 ram_discard_manager_replay_populated(rdm, &section,
1772 populate_read_section, NULL);
1773 } else {
1774 populate_read_range(rb, 0, rb->used_length);
1775 }
eeccb99c
AG
1776}
1777
1778/*
1779 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1780 */
1781void ram_write_tracking_prepare(void)
1782{
82ea3e3b 1783 RAMBlock *block;
eeccb99c
AG
1784
1785 RCU_READ_LOCK_GUARD();
1786
82ea3e3b 1787 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1788 /* Nothing to do for read-only and MMIO-writable regions */
82ea3e3b 1789 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1790 continue;
1791 }
1792
1793 /*
1794 * Populate pages of the RAM block before enabling userfault_fd
1795 * write protection.
1796 *
1797 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1798 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1799 * pages with pte_none() entries in page table.
1800 */
f7b9dcfb 1801 ram_block_populate_read(block);
eeccb99c
AG
1802 }
1803}
1804
278e2f55
AG
1805/*
1806 * ram_write_tracking_start: start UFFD-WP memory tracking
1807 *
1808 * Returns 0 for success or negative value in case of error
1809 */
1810int ram_write_tracking_start(void)
1811{
1812 int uffd_fd;
1813 RAMState *rs = ram_state;
82ea3e3b 1814 RAMBlock *block;
278e2f55
AG
1815
1816 /* Open UFFD file descriptor */
1817 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1818 if (uffd_fd < 0) {
1819 return uffd_fd;
1820 }
1821 rs->uffdio_fd = uffd_fd;
1822
1823 RCU_READ_LOCK_GUARD();
1824
82ea3e3b 1825 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1826 /* Nothing to do for read-only and MMIO-writable regions */
82ea3e3b 1827 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1828 continue;
1829 }
1830
1831 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1832 if (uffd_register_memory(rs->uffdio_fd, block->host,
1833 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1834 goto fail;
1835 }
1836 /* Apply UFFD write protection to the block memory range */
82ea3e3b
AG
1837 if (uffd_change_protection(rs->uffdio_fd, block->host,
1838 block->max_length, true, false)) {
278e2f55
AG
1839 goto fail;
1840 }
82ea3e3b
AG
1841 block->flags |= RAM_UF_WRITEPROTECT;
1842 memory_region_ref(block->mr);
278e2f55 1843
82ea3e3b
AG
1844 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1845 block->host, block->max_length);
278e2f55
AG
1846 }
1847
1848 return 0;
1849
1850fail:
1851 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1852
82ea3e3b
AG
1853 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1854 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1855 continue;
1856 }
1857 /*
1858 * In case some memory block failed to be write-protected
1859 * remove protection and unregister all succeeded RAM blocks
1860 */
82ea3e3b
AG
1861 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1862 false, false);
1863 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1864 /* Cleanup flags and remove reference */
82ea3e3b
AG
1865 block->flags &= ~RAM_UF_WRITEPROTECT;
1866 memory_region_unref(block->mr);
278e2f55
AG
1867 }
1868
1869 uffd_close_fd(uffd_fd);
1870 rs->uffdio_fd = -1;
1871 return -1;
1872}
1873
1874/**
1875 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1876 */
1877void ram_write_tracking_stop(void)
1878{
1879 RAMState *rs = ram_state;
82ea3e3b 1880 RAMBlock *block;
278e2f55
AG
1881
1882 RCU_READ_LOCK_GUARD();
1883
82ea3e3b
AG
1884 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1885 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1886 continue;
1887 }
1888 /* Remove protection and unregister all affected RAM blocks */
82ea3e3b
AG
1889 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1890 false, false);
1891 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1892
82ea3e3b
AG
1893 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1894 block->host, block->max_length);
278e2f55
AG
1895
1896 /* Cleanup flags and remove reference */
82ea3e3b
AG
1897 block->flags &= ~RAM_UF_WRITEPROTECT;
1898 memory_region_unref(block->mr);
278e2f55
AG
1899 }
1900
1901 /* Finally close UFFD file descriptor */
1902 uffd_close_fd(rs->uffdio_fd);
1903 rs->uffdio_fd = -1;
1904}
1905
1906#else
1907/* No target OS support, stubs just fail or ignore */
1908
1909static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1910{
1911 (void) rs;
1912 (void) offset;
1913
1914 return NULL;
1915}
1916
1917static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1918 unsigned long start_page)
1919{
1920 (void) rs;
1921 (void) pss;
1922 (void) start_page;
1923
1924 return 0;
1925}
1926
1927bool ram_write_tracking_available(void)
1928{
1929 return false;
1930}
1931
1932bool ram_write_tracking_compatible(void)
1933{
1934 assert(0);
1935 return false;
1936}
1937
1938int ram_write_tracking_start(void)
1939{
1940 assert(0);
1941 return -1;
1942}
1943
1944void ram_write_tracking_stop(void)
1945{
1946 assert(0);
1947}
1948#endif /* defined(__linux__) */
1949
3d0684b2 1950/**
ff1543af 1951 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1952 *
1953 * Skips pages that are already sent (!dirty)
a82d593b 1954 *
a5f7b1a6 1955 * Returns true if a queued page is found
a82d593b 1956 *
6f37bb8b 1957 * @rs: current RAM state
3d0684b2 1958 * @pss: data about the state of the current dirty page scan
a82d593b 1959 */
f20e2865 1960static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1961{
1962 RAMBlock *block;
1963 ram_addr_t offset;
a82d593b 1964
cfd66f30 1965 block = unqueue_page(rs, &offset);
a82d593b 1966
278e2f55
AG
1967 if (!block) {
1968 /*
1969 * Poll write faults too if background snapshot is enabled; that's
1970 * when we have vcpus got blocked by the write protected pages.
1971 */
1972 block = poll_fault_page(rs, &offset);
1973 }
1974
a82d593b 1975 if (block) {
a82d593b
DDAG
1976 /*
1977 * We want the background search to continue from the queued page
1978 * since the guest is likely to want other pages near to the page
1979 * it just requested.
1980 */
1981 pss->block = block;
a935e30f 1982 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1983
1984 /*
1985 * This unqueued page would break the "one round" check, even if it is
1986 * really rare.
1987 */
1988 pss->complete_round = false;
ea2faf0c 1989 pss->postcopy_requested = true;
a82d593b
DDAG
1990 }
1991
1992 return !!block;
1993}
1994
6c595cde 1995/**
5e58f968
JQ
1996 * migration_page_queue_free: drop any remaining pages in the ram
1997 * request queue
6c595cde 1998 *
3d0684b2
JQ
1999 * It should be empty at the end anyway, but in error cases there may
2000 * be some left. In case any page is left, we drop it.
2001 *
6c595cde 2002 */
83c13382 2003static void migration_page_queue_free(RAMState *rs)
6c595cde 2004{
ec481c6c 2005 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2006 /* This queue should generally be empty - but in the case of a failed
2007 * migration it might still contain some entries.
2008 */
89ac5a1d 2009 RCU_READ_LOCK_GUARD();
ec481c6c 2010 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2011 memory_region_unref(mspr->rb->mr);
ec481c6c 2012 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2013 g_free(mspr);
2014 }
6c595cde
DDAG
2015}
2016
2017/**
3d0684b2
JQ
2018 * ram_save_queue_pages: queue the page for transmission
2019 *
2020 * A request from postcopy destination for example.
2021 *
2022 * Returns zero on success or negative on error
2023 *
3d0684b2
JQ
2024 * @rbname: Name of the RAMBlock of the request. NULL means the
2025 * same as the last one.
2026 * @start: starting address from the start of the RAMBlock
2027 * @len: length (in bytes) to send
6c595cde 2028 */
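/*
 * Illustrative sketch only (hypothetical caller, not taken from this file):
 * a postcopy fault handler asking for one faulting page might do something
 * like the following, where 'block' and 'page_aligned_offset' are assumed
 * to come from the fault address lookup:
 *
 *     if (ram_save_queue_pages(block->idstr, page_aligned_offset,
 *                              qemu_ram_pagesize(block))) {
 *         ... treat it as a fatal migration error ...
 *     }
 */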
96506894 2029int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2030{
2031 RAMBlock *ramblock;
53518d94 2032 RAMState *rs = ram_state;
6c595cde 2033
9360447d 2034 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2035 RCU_READ_LOCK_GUARD();
2036
6c595cde
DDAG
2037 if (!rbname) {
2038 /* Reuse last RAMBlock */
68a098f3 2039 ramblock = rs->last_req_rb;
6c595cde
DDAG
2040
2041 if (!ramblock) {
2042 /*
2043 * Shouldn't happen, we can't reuse the last RAMBlock if
2044 * it's the 1st request.
2045 */
2046 error_report("ram_save_queue_pages no previous block");
03acb4e9 2047 return -1;
6c595cde
DDAG
2048 }
2049 } else {
2050 ramblock = qemu_ram_block_by_name(rbname);
2051
2052 if (!ramblock) {
2053 /* We shouldn't be asked for a non-existent RAMBlock */
2054 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2055 return -1;
6c595cde 2056 }
68a098f3 2057 rs->last_req_rb = ramblock;
6c595cde
DDAG
2058 }
2059 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2060 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2061 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2062 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2063 __func__, start, len, ramblock->used_length);
03acb4e9 2064 return -1;
6c595cde
DDAG
2065 }
2066
ec481c6c 2067 struct RAMSrcPageRequest *new_entry =
b21e2380 2068 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2069 new_entry->rb = ramblock;
2070 new_entry->offset = start;
2071 new_entry->len = len;
2072
2073 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2074 qemu_mutex_lock(&rs->src_page_req_mutex);
2075 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2076 migration_make_urgent_request();
ec481c6c 2077 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2078
2079 return 0;
6c595cde
DDAG
2080}
2081
d7400a34
XG
2082static bool save_page_use_compression(RAMState *rs)
2083{
2084 if (!migrate_use_compression()) {
2085 return false;
2086 }
2087
2088 /*
1a373522
DH
2089 * If xbzrle is enabled (e.g., after first round of migration), stop
2090 * using the data compression. In theory, xbzrle can do better than
2091 * compression.
d7400a34 2092 */
1a373522
DH
2093 if (rs->xbzrle_enabled) {
2094 return false;
d7400a34
XG
2095 }
2096
1a373522 2097 return true;
d7400a34
XG
2098}
2099
5e5fdcff
XG
2100/*
2101 * try to compress the page before posting it out, return true if the page
2102 * has been properly handled by compression, otherwise needs other
2103 * paths to handle it
2104 */
2105static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2106{
2107 if (!save_page_use_compression(rs)) {
2108 return false;
2109 }
2110
2111 /*
2112 * When starting the process of a new block, the first page of
2113 * the block should be sent out before other pages in the same
2114 * block, and all the pages in the last block should have been sent
2115 * out; keeping this order is important, because the 'cont' flag
2116 * is used to avoid resending the block name.
2117 *
2118 * We post the first page as a normal page since compression would
2119 * take a lot of CPU resource.
2120 */
2121 if (block != rs->last_sent_block) {
2122 flush_compressed_data(rs);
2123 return false;
2124 }
2125
2126 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2127 return true;
2128 }
2129
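 /* No idle compression thread was available; account for it and let the page fall back to the normal save path */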
76e03000 2130 compression_counters.busy++;
5e5fdcff
XG
2131 return false;
2132}
2133
a82d593b 2134/**
3d0684b2 2135 * ram_save_target_page: save one target page
a82d593b 2136 *
3d0684b2 2137 * Returns the number of pages written
a82d593b 2138 *
6f37bb8b 2139 * @rs: current RAM state
3d0684b2 2140 * @pss: data about the page we want to send
a82d593b 2141 */
05931ec5 2142static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2143{
a8ec91f9 2144 RAMBlock *block = pss->block;
8bba004c 2145 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2146 int res;
2147
2148 if (control_save_page(rs, block, offset, &res)) {
2149 return res;
2150 }
2151
5e5fdcff
XG
2152 if (save_compress_page(rs, block, offset)) {
2153 return 1;
d7400a34
XG
2154 }
2155
2156 res = save_zero_page(rs, block, offset);
2157 if (res > 0) {
2158 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2159 * page would be stale
2160 */
2161 if (!save_page_use_compression(rs)) {
2162 XBZRLE_cache_lock();
2163 xbzrle_cache_zero_page(rs, block->offset + offset);
2164 XBZRLE_cache_unlock();
2165 }
d7400a34
XG
2166 return res;
2167 }
2168
da3f56cb 2169 /*
c6b3a2e0
WY
2170 * Do not use multifd for:
2171 * 1. Compression as the first page in the new block should be posted out
2172 * before sending the compressed page
2173 * 2. In postcopy as one whole host page should be placed
da3f56cb 2174 */
c6b3a2e0
WY
2175 if (!save_page_use_compression(rs) && migrate_use_multifd()
2176 && !migration_in_postcopy()) {
b9ee2f7d 2177 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2178 }
2179
05931ec5 2180 return ram_save_page(rs, pss);
a82d593b
DDAG
2181}
2182
2183/**
3d0684b2 2184 * ram_save_host_page: save a whole host page
a82d593b 2185 *
3d0684b2
JQ
2186 * Starting at pss->page, send pages up to the end of the current host
2187 * page. It's valid for the initial offset to point into the middle of
2188 * a host page in which case the remainder of the hostpage is sent.
2189 * Only dirty target pages are sent. Note that the host page size may
2190 * be a huge page for this block.
1eb3fc0a
DDAG
2191 * The saving stops at the boundary of the used_length of the block
2192 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2193 *
3d0684b2
JQ
2194 * Returns the number of pages written or negative on error
2195 *
6f37bb8b 2196 * @rs: current RAM state
3d0684b2 2197 * @pss: data about the page we want to send
a82d593b 2198 */
05931ec5 2199static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2200{
2201 int tmppages, pages = 0;
a935e30f
JQ
2202 size_t pagesize_bits =
2203 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
ba1b7c81
KJ
2204 unsigned long hostpage_boundary =
2205 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
278e2f55
AG
2206 unsigned long start_page = pss->page;
2207 int res;
4c011c37 2208
fbd162e6 2209 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2210 error_report("block %s should not be migrated !", pss->block->idstr);
2211 return 0;
2212 }
2213
a82d593b 2214 do {
1faa5665 2215 /* Check if the page is dirty and, if it is, send it */
ba1b7c81 2216 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
05931ec5 2217 tmppages = ram_save_target_page(rs, pss);
ba1b7c81
KJ
2218 if (tmppages < 0) {
2219 return tmppages;
2220 }
a82d593b 2221
ba1b7c81
KJ
2222 pages += tmppages;
2223 /*
2224 * Allow rate limiting to happen in the middle of huge pages if
2225 * something is sent in the current iteration.
2226 */
2227 if (pagesize_bits > 1 && tmppages > 0) {
2228 migration_rate_limit();
2229 }
23feba90 2230 }
ba1b7c81
KJ
2231 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2232 } while ((pss->page < hostpage_boundary) &&
8bba004c
AR
2233 offset_in_ramblock(pss->block,
2234 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
ba1b7c81 2235 /* The offset we leave with is the min boundary of host page and block */
258f5c98 2236 pss->page = MIN(pss->page, hostpage_boundary);
278e2f55
AG
2237
2238 res = ram_save_release_protection(rs, pss, start_page);
2239 return (res < 0 ? res : pages);
a82d593b 2240}
6c595cde 2241
56e93d26 2242/**
3d0684b2 2243 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2244 *
2245 * Called within an RCU critical section.
2246 *
e8f3735f
XG
2247 * Returns the number of pages written where zero means no dirty pages,
2248 * or negative on error
56e93d26 2249 *
6f37bb8b 2250 * @rs: current RAM state
a82d593b
DDAG
2251 *
2252 * On systems where host-page-size > target-page-size it will send all the
2253 * pages in a host page that are dirty.
56e93d26 2254 */
05931ec5 2255static int ram_find_and_save_block(RAMState *rs)
56e93d26 2256{
b8fb8cb7 2257 PageSearchStatus pss;
56e93d26 2258 int pages = 0;
b9e60928 2259 bool again, found;
56e93d26 2260
0827b9e9
AA
2261 /* No dirty page as there is zero RAM */
2262 if (!ram_bytes_total()) {
2263 return pages;
2264 }
2265
6f37bb8b 2266 pss.block = rs->last_seen_block;
a935e30f 2267 pss.page = rs->last_page;
b8fb8cb7
DDAG
2268 pss.complete_round = false;
2269
2270 if (!pss.block) {
2271 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2272 }
56e93d26 2273
b9e60928 2274 do {
a82d593b 2275 again = true;
f20e2865 2276 found = get_queued_page(rs, &pss);
b9e60928 2277
a82d593b
DDAG
2278 if (!found) {
2279 /* priority queue empty, so just search for something dirty */
f20e2865 2280 found = find_dirty_block(rs, &pss, &again);
a82d593b 2281 }
f3f491fc 2282
a82d593b 2283 if (found) {
05931ec5 2284 pages = ram_save_host_page(rs, &pss);
56e93d26 2285 }
b9e60928 2286 } while (!pages && again);
56e93d26 2287
6f37bb8b 2288 rs->last_seen_block = pss.block;
a935e30f 2289 rs->last_page = pss.page;
56e93d26
JQ
2290
2291 return pages;
2292}
2293
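/*
 * acct_update_position: account for @size bytes saved outside the normal
 * RAM saving path: zero pages bump the duplicate counter, anything else
 * is counted as normal pages, added to the transferred byte total and
 * reflected in the QEMUFile position.
 */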
2294void acct_update_position(QEMUFile *f, size_t size, bool zero)
2295{
2296 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2297
56e93d26 2298 if (zero) {
9360447d 2299 ram_counters.duplicate += pages;
56e93d26 2300 } else {
9360447d 2301 ram_counters.normal += pages;
4c2d0f6d 2302 ram_transferred_add(size);
56e93d26
JQ
2303 qemu_update_position(f, size);
2304 }
2305}
2306
fbd162e6 2307static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2308{
2309 RAMBlock *block;
2310 uint64_t total = 0;
2311
89ac5a1d
DDAG
2312 RCU_READ_LOCK_GUARD();
2313
fbd162e6
YK
2314 if (count_ignored) {
2315 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2316 total += block->used_length;
2317 }
2318 } else {
2319 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2320 total += block->used_length;
2321 }
99e15582 2322 }
56e93d26
JQ
2323 return total;
2324}
2325
fbd162e6
YK
2326uint64_t ram_bytes_total(void)
2327{
2328 return ram_bytes_total_common(false);
2329}
2330
f265e0e4 2331static void xbzrle_load_setup(void)
56e93d26 2332{
f265e0e4 2333 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2334}
2335
f265e0e4
JQ
2336static void xbzrle_load_cleanup(void)
2337{
2338 g_free(XBZRLE.decoded_buf);
2339 XBZRLE.decoded_buf = NULL;
2340}
2341
7d7c96be
PX
2342static void ram_state_cleanup(RAMState **rsp)
2343{
b9ccaf6d
DDAG
2344 if (*rsp) {
2345 migration_page_queue_free(*rsp);
2346 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2347 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2348 g_free(*rsp);
2349 *rsp = NULL;
2350 }
7d7c96be
PX
2351}
2352
84593a08
PX
2353static void xbzrle_cleanup(void)
2354{
2355 XBZRLE_cache_lock();
2356 if (XBZRLE.cache) {
2357 cache_fini(XBZRLE.cache);
2358 g_free(XBZRLE.encoded_buf);
2359 g_free(XBZRLE.current_buf);
2360 g_free(XBZRLE.zero_target_page);
2361 XBZRLE.cache = NULL;
2362 XBZRLE.encoded_buf = NULL;
2363 XBZRLE.current_buf = NULL;
2364 XBZRLE.zero_target_page = NULL;
2365 }
2366 XBZRLE_cache_unlock();
2367}
2368
f265e0e4 2369static void ram_save_cleanup(void *opaque)
56e93d26 2370{
53518d94 2371 RAMState **rsp = opaque;
6b6712ef 2372 RAMBlock *block;
eb859c53 2373
278e2f55
AG
2374 /* We don't use dirty log with background snapshots */
2375 if (!migrate_background_snapshot()) {
2376 /* caller have hold iothread lock or is in a bh, so there is
2377 * no writing race against the migration bitmap
2378 */
63b41db4
HH
2379 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2380 /*
2381 * do not stop dirty log without starting it, since
2382 * memory_global_dirty_log_stop will assert that
2383 * memory_global_dirty_log_start/stop used in pairs
2384 */
2385 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2386 }
278e2f55 2387 }
6b6712ef 2388
fbd162e6 2389 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2390 g_free(block->clear_bmap);
2391 block->clear_bmap = NULL;
6b6712ef
JQ
2392 g_free(block->bmap);
2393 block->bmap = NULL;
56e93d26
JQ
2394 }
2395
84593a08 2396 xbzrle_cleanup();
f0afa331 2397 compress_threads_save_cleanup();
7d7c96be 2398 ram_state_cleanup(rsp);
56e93d26
JQ
2399}
2400
6f37bb8b 2401static void ram_state_reset(RAMState *rs)
56e93d26 2402{
6f37bb8b
JQ
2403 rs->last_seen_block = NULL;
2404 rs->last_sent_block = NULL;
269ace29 2405 rs->last_page = 0;
6f37bb8b 2406 rs->last_version = ram_list.version;
1a373522 2407 rs->xbzrle_enabled = false;
56e93d26
JQ
2408}
2409
2410#define MAX_WAIT 50 /* ms, half buffered_file limit */
2411
e0b266f0
DDAG
2412/* **** functions for postcopy ***** */
2413
ced1c616
PB
2414void ram_postcopy_migrated_memory_release(MigrationState *ms)
2415{
2416 struct RAMBlock *block;
ced1c616 2417
fbd162e6 2418 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2419 unsigned long *bitmap = block->bmap;
2420 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2421 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2422
2423 while (run_start < range) {
2424 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2425 ram_discard_range(block->idstr,
2426 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2427 ((ram_addr_t)(run_end - run_start))
2428 << TARGET_PAGE_BITS);
ced1c616
PB
2429 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2430 }
2431 }
2432}
2433
3d0684b2
JQ
2434/**
2435 * postcopy_send_discard_bm_ram: discard a RAMBlock
2436 *
e0b266f0 2437 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2438 *
2439 * @ms: current migration state
89dab31b 2440 * @block: RAMBlock to discard
e0b266f0 2441 */
9e7d1223 2442static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2443{
6b6712ef 2444 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2445 unsigned long current;
1e7cf8c3 2446 unsigned long *bitmap = block->bmap;
e0b266f0 2447
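 /* Send one discard command per contiguous run of dirty pages */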
6b6712ef 2448 for (current = 0; current < end; ) {
1e7cf8c3 2449 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2450 unsigned long zero, discard_length;
e0b266f0 2451
33a5cb62
WY
2452 if (one >= end) {
2453 break;
2454 }
e0b266f0 2455
1e7cf8c3 2456 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2457
2458 if (zero >= end) {
2459 discard_length = end - one;
e0b266f0 2460 } else {
33a5cb62
WY
2461 discard_length = zero - one;
2462 }
810cf2bb 2463 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2464 current = one + discard_length;
e0b266f0 2465 }
e0b266f0
DDAG
2466}
2467
f30c2e5b
PX
2468static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2469
3d0684b2
JQ
2470/**
2471 * postcopy_each_ram_send_discard: discard all RAMBlocks
2472 *
e0b266f0
DDAG
2473 * Utility for the outgoing postcopy code.
2474 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2475 * passing it bitmap indexes and name.
e0b266f0
DDAG
2476 * (qemu_ram_foreach_block ends up passing unscaled lengths
2477 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2478 *
2479 * @ms: current migration state
e0b266f0 2480 */
739fcc1b 2481static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2482{
2483 struct RAMBlock *block;
e0b266f0 2484
fbd162e6 2485 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2486 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2487
f30c2e5b
PX
2488 /*
2489 * Deal with TPS != HPS and huge pages. It discards any partially sent
2490 * host-page size chunks, mark any partially dirty host-page size
2491 * chunks as all dirty. In this case the host-page is the host-page
2492 * for the particular RAMBlock, i.e. it might be a huge page.
2493 */
2494 postcopy_chunk_hostpages_pass(ms, block);
2495
e0b266f0
DDAG
2496 /*
2497 * Postcopy sends chunks of bitmap over the wire, but it
2498 * just needs indexes at this point, avoids it having
2499 * target page specific code.
2500 */
739fcc1b 2501 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2502 postcopy_discard_send_finish(ms);
e0b266f0 2503 }
e0b266f0
DDAG
2504}
2505
3d0684b2 2506/**
8324ef86 2507 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2508 *
2509 * Helper for postcopy_each_ram_send_discard; it canonicalizes the
2510 * dirty bitmap in host-page-sized chunks: any host page that is
2511 * partially dirty gets marked as fully dirty.
99e314eb 2512 *
3d0684b2
JQ
2513 * Postcopy requires that all target pages in a hostpage are dirty or
2514 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2515 *
3d0684b2 2516 * @ms: current migration state
3d0684b2 2517 * @block: block that contains the page we want to canonicalize
99e314eb 2518 */
1e7cf8c3 2519static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2520{
53518d94 2521 RAMState *rs = ram_state;
6b6712ef 2522 unsigned long *bitmap = block->bmap;
29c59172 2523 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2524 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2525 unsigned long run_start;
2526
29c59172
DDAG
2527 if (block->page_size == TARGET_PAGE_SIZE) {
2528 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2529 return;
2530 }
2531
1e7cf8c3
WY
2532 /* Find a dirty page */
2533 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2534
6b6712ef 2535 while (run_start < pages) {
99e314eb
DDAG
2536
2537 /*
2538 * If the start of this run of pages is in the middle of a host
2539 * page, then we need to fixup this host page.
2540 */
9dec3cc3 2541 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2542 /* Find the end of this run */
1e7cf8c3 2543 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2544 /*
2545 * If the end isn't at the start of a host page, then the
2546 * run doesn't finish at the end of a host page
2547 * and we need to discard.
2548 */
99e314eb
DDAG
2549 }
2550
9dec3cc3 2551 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2552 unsigned long page;
dad45ab2
WY
2553 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2554 host_ratio);
2555 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2556
99e314eb
DDAG
2557 /* Clean up the bitmap */
2558 for (page = fixup_start_addr;
2559 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2560 /*
2561 * Remark them as dirty, updating the count for any pages
2562 * that weren't previously dirty.
2563 */
0d8ec885 2564 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2565 }
2566 }
2567
1e7cf8c3
WY
2568 /* Find the next dirty page for the next iteration */
2569 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2570 }
2571}
2572
3d0684b2
JQ
2573/**
2574 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2575 *
e0b266f0
DDAG
2576 * Transmit the set of pages to be discarded after precopy to the target;
2577 * these are pages that:
2578 * a) Have been previously transmitted but are now dirty again
2579 * b) Pages that have never been transmitted, this ensures that
2580 * any pages on the destination that have been mapped by background
2581 * tasks get discarded (transparent huge pages is the specific concern)
2582 * Hopefully this is pretty sparse
3d0684b2
JQ
2583 *
2584 * @ms: current migration state
e0b266f0 2585 */
739fcc1b 2586void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2587{
53518d94 2588 RAMState *rs = ram_state;
e0b266f0 2589
89ac5a1d 2590 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2591
2592 /* This should be our last sync, the src is now paused */
eb859c53 2593 migration_bitmap_sync(rs);
e0b266f0 2594
6b6712ef
JQ
2595 /* Easiest way to make sure we don't resume in the middle of a host-page */
2596 rs->last_seen_block = NULL;
2597 rs->last_sent_block = NULL;
2598 rs->last_page = 0;
e0b266f0 2599
739fcc1b 2600 postcopy_each_ram_send_discard(ms);
e0b266f0 2601
739fcc1b 2602 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2603}
2604
3d0684b2
JQ
2605/**
2606 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2607 *
3d0684b2 2608 * Returns zero on success
e0b266f0 2609 *
36449157
JQ
2610 * @rbname: name of the RAMBlock of the request. NULL means the
2611 * same that last one.
3d0684b2
JQ
2612 * @start: RAMBlock starting page
2613 * @length: RAMBlock size
e0b266f0 2614 */
aaa2064c 2615int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2616{
36449157 2617 trace_ram_discard_range(rbname, start, length);
d3a5038c 2618
89ac5a1d 2619 RCU_READ_LOCK_GUARD();
36449157 2620 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2621
2622 if (!rb) {
36449157 2623 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2624 return -1;
e0b266f0
DDAG
2625 }
2626
814bb08f
PX
2627 /*
2628 * On source VM, we don't need to update the received bitmap since
2629 * we don't even have one.
2630 */
2631 if (rb->receivedmap) {
2632 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2633 length >> qemu_target_page_bits());
2634 }
2635
03acb4e9 2636 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2637}
2638
84593a08
PX
2639/*
2640 * For every allocation, we will try not to crash the VM if the
2641 * allocation fails.
2642 */
2643static int xbzrle_init(void)
2644{
2645 Error *local_err = NULL;
2646
2647 if (!migrate_use_xbzrle()) {
2648 return 0;
2649 }
2650
2651 XBZRLE_cache_lock();
2652
2653 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2654 if (!XBZRLE.zero_target_page) {
2655 error_report("%s: Error allocating zero page", __func__);
2656 goto err_out;
2657 }
2658
2659 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2660 TARGET_PAGE_SIZE, &local_err);
2661 if (!XBZRLE.cache) {
2662 error_report_err(local_err);
2663 goto free_zero_page;
2664 }
2665
2666 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2667 if (!XBZRLE.encoded_buf) {
2668 error_report("%s: Error allocating encoded_buf", __func__);
2669 goto free_cache;
2670 }
2671
2672 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2673 if (!XBZRLE.current_buf) {
2674 error_report("%s: Error allocating current_buf", __func__);
2675 goto free_encoded_buf;
2676 }
2677
2678 /* We are all good */
2679 XBZRLE_cache_unlock();
2680 return 0;
2681
2682free_encoded_buf:
2683 g_free(XBZRLE.encoded_buf);
2684 XBZRLE.encoded_buf = NULL;
2685free_cache:
2686 cache_fini(XBZRLE.cache);
2687 XBZRLE.cache = NULL;
2688free_zero_page:
2689 g_free(XBZRLE.zero_target_page);
2690 XBZRLE.zero_target_page = NULL;
2691err_out:
2692 XBZRLE_cache_unlock();
2693 return -ENOMEM;
2694}
2695
53518d94 2696static int ram_state_init(RAMState **rsp)
56e93d26 2697{
7d00ee6a
PX
2698 *rsp = g_try_new0(RAMState, 1);
2699
2700 if (!*rsp) {
2701 error_report("%s: Init ramstate fail", __func__);
2702 return -1;
2703 }
53518d94
JQ
2704
2705 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2706 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2707 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2708
7d00ee6a 2709 /*
40c4d4a8
IR
2710 * Count the total number of pages used by ram blocks not including any
2711 * gaps due to alignment or unplugs.
03158519 2712 * This must match with the initial values of dirty bitmap.
7d00ee6a 2713 */
40c4d4a8 2714 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2715 ram_state_reset(*rsp);
2716
2717 return 0;
2718}
2719
d6eff5d7 2720static void ram_list_init_bitmaps(void)
7d00ee6a 2721{
002cad6b 2722 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2723 RAMBlock *block;
2724 unsigned long pages;
002cad6b 2725 uint8_t shift;
56e93d26 2726
0827b9e9
AA
2727 /* Skip setting bitmap if there is no RAM */
2728 if (ram_bytes_total()) {
002cad6b
PX
2729 shift = ms->clear_bitmap_shift;
2730 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2731 error_report("clear_bitmap_shift (%u) too big, using "
2732 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2733 shift = CLEAR_BITMAP_SHIFT_MAX;
2734 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2735 error_report("clear_bitmap_shift (%u) too small, using "
2736 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2737 shift = CLEAR_BITMAP_SHIFT_MIN;
2738 }
2739
fbd162e6 2740 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2741 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2742 /*
2743 * The initial dirty bitmap for migration must be set with all
2744 * ones to make sure we'll migrate every guest RAM page to the
2745 * destination.
40c4d4a8
IR
2746 * Here we set RAMBlock.bmap all to 1 because when we restart a
2747 * new migration after a failed one, ram_list.
2748 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2749 * guest memory.
03158519 2750 */
6b6712ef 2751 block->bmap = bitmap_new(pages);
40c4d4a8 2752 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2753 block->clear_bmap_shift = shift;
2754 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2755 }
f3f491fc 2756 }
d6eff5d7
PX
2757}
2758
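/*
 * Remove pages that a RamDiscardManager reports as discarded ("logically
 * unplugged") from the dirty bitmap accounting, so they are never migrated.
 */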
be39b4cd
DH
2759static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2760{
2761 unsigned long pages;
2762 RAMBlock *rb;
2763
2764 RCU_READ_LOCK_GUARD();
2765
2766 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2767 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2768 rs->migration_dirty_pages -= pages;
2769 }
2770}
2771
d6eff5d7
PX
2772static void ram_init_bitmaps(RAMState *rs)
2773{
2774 /* For memory_global_dirty_log_start below. */
2775 qemu_mutex_lock_iothread();
2776 qemu_mutex_lock_ramlist();
f3f491fc 2777
89ac5a1d
DDAG
2778 WITH_RCU_READ_LOCK_GUARD() {
2779 ram_list_init_bitmaps();
278e2f55
AG
2780 /* We don't use dirty log with background snapshots */
2781 if (!migrate_background_snapshot()) {
63b41db4 2782 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
2783 migration_bitmap_sync_precopy(rs);
2784 }
89ac5a1d 2785 }
56e93d26 2786 qemu_mutex_unlock_ramlist();
49877834 2787 qemu_mutex_unlock_iothread();
be39b4cd
DH
2788
2789 /*
2790 * After an eventual first bitmap sync, fixup the initial bitmap
2791 * containing all 1s to exclude any discarded pages from migration.
2792 */
2793 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
2794}
2795
2796static int ram_init_all(RAMState **rsp)
2797{
2798 if (ram_state_init(rsp)) {
2799 return -1;
2800 }
2801
2802 if (xbzrle_init()) {
2803 ram_state_cleanup(rsp);
2804 return -1;
2805 }
2806
2807 ram_init_bitmaps(*rsp);
a91246c9
HZ
2808
2809 return 0;
2810}
2811
08614f34
PX
2812static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2813{
2814 RAMBlock *block;
2815 uint64_t pages = 0;
2816
2817 /*
2818 * Postcopy is not using xbzrle/compression, so no need for that.
2819 * Also, since the source is already halted, we don't need to care
2820 * about dirty page logging either.
2821 */
2822
fbd162e6 2823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2824 pages += bitmap_count_one(block->bmap,
2825 block->used_length >> TARGET_PAGE_BITS);
2826 }
2827
2828 /* This may not be aligned with current bitmaps. Recalculate. */
2829 rs->migration_dirty_pages = pages;
2830
1a373522 2831 ram_state_reset(rs);
08614f34
PX
2832
2833 /* Update RAMState cache of output QEMUFile */
2834 rs->f = out;
2835
2836 trace_ram_state_resume_prepare(pages);
2837}
2838
6bcb05fc
WW
2839/*
2840 * This function clears bits of the free pages reported by the caller from the
2841 * migration dirty bitmap. @addr is the host address corresponding to the
2842 * start of the continuous guest free pages, and @len is the total bytes of
2843 * those pages.
2844 */
2845void qemu_guest_free_page_hint(void *addr, size_t len)
2846{
2847 RAMBlock *block;
2848 ram_addr_t offset;
2849 size_t used_len, start, npages;
2850 MigrationState *s = migrate_get_current();
2851
2852 /* This function is currently expected to be used during live migration */
2853 if (!migration_is_setup_or_active(s->state)) {
2854 return;
2855 }
2856
2857 for (; len > 0; len -= used_len, addr += used_len) {
2858 block = qemu_ram_block_from_host(addr, false, &offset);
2859 if (unlikely(!block || offset >= block->used_length)) {
2860 /*
2861 * The implementation might not support RAMBlock resize during
2862 * live migration, but it could happen in theory with future
2863 * updates. So we add a check here to capture that case.
2864 */
2865 error_report_once("%s unexpected error", __func__);
2866 return;
2867 }
2868
2869 if (len <= block->used_length - offset) {
2870 used_len = len;
2871 } else {
2872 used_len = block->used_length - offset;
2873 }
2874
2875 start = offset >> TARGET_PAGE_BITS;
2876 npages = used_len >> TARGET_PAGE_BITS;
2877
2878 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
2879 /*
2880 * The skipped free pages are equivalent to having been sent from clear_bmap's
2881 * perspective, so clear the bits from the memory region bitmap which
2882 * are initially set. Otherwise those skipped pages will be sent in
2883 * the next round after syncing from the memory region bitmap.
2884 */
1230a25f 2885 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
2886 ram_state->migration_dirty_pages -=
2887 bitmap_count_one_with_offset(block->bmap, start, npages);
2888 bitmap_clear(block->bmap, start, npages);
2889 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2890 }
2891}
2892
3d0684b2
JQ
2893/*
2894 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2895 * a long-running RCU critical section. When rcu-reclaims in the code
2896 * start to become numerous it will be necessary to reduce the
2897 * granularity of these critical sections.
2898 */
2899
3d0684b2
JQ
2900/**
2901 * ram_save_setup: Setup RAM for migration
2902 *
2903 * Returns zero to indicate success and negative for error
2904 *
2905 * @f: QEMUFile where to send the data
2906 * @opaque: RAMState pointer
2907 */
a91246c9
HZ
2908static int ram_save_setup(QEMUFile *f, void *opaque)
2909{
53518d94 2910 RAMState **rsp = opaque;
a91246c9
HZ
2911 RAMBlock *block;
2912
dcaf446e
XG
2913 if (compress_threads_save_setup()) {
2914 return -1;
2915 }
2916
a91246c9
HZ
2917 /* migration has already setup the bitmap, reuse it. */
2918 if (!migration_in_colo_state()) {
7d00ee6a 2919 if (ram_init_all(rsp) != 0) {
dcaf446e 2920 compress_threads_save_cleanup();
a91246c9 2921 return -1;
53518d94 2922 }
a91246c9 2923 }
53518d94 2924 (*rsp)->f = f;
a91246c9 2925
0e6ebd48
DDAG
2926 WITH_RCU_READ_LOCK_GUARD() {
2927 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2928
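 /* Announce every migratable block (name and sizes) so that the destination side can match them up before any page data arrives */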
0e6ebd48
DDAG
2929 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2930 qemu_put_byte(f, strlen(block->idstr));
2931 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2932 qemu_put_be64(f, block->used_length);
2933 if (migrate_postcopy_ram() && block->page_size !=
2934 qemu_host_page_size) {
2935 qemu_put_be64(f, block->page_size);
2936 }
2937 if (migrate_ignore_shared()) {
2938 qemu_put_be64(f, block->mr->addr);
2939 }
fbd162e6 2940 }
56e93d26
JQ
2941 }
2942
56e93d26
JQ
2943 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2944 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2945
99f2c6fb 2946 multifd_send_sync_main(f);
56e93d26 2947 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2948 qemu_fflush(f);
56e93d26
JQ
2949
2950 return 0;
2951}
2952
3d0684b2
JQ
2953/**
2954 * ram_save_iterate: iterative stage for migration
2955 *
2956 * Returns zero to indicate success and negative for error
2957 *
2958 * @f: QEMUFile where to send the data
2959 * @opaque: RAMState pointer
2960 */
56e93d26
JQ
2961static int ram_save_iterate(QEMUFile *f, void *opaque)
2962{
53518d94
JQ
2963 RAMState **temp = opaque;
2964 RAMState *rs = *temp;
3d4095b2 2965 int ret = 0;
56e93d26
JQ
2966 int i;
2967 int64_t t0;
5c90308f 2968 int done = 0;
56e93d26 2969
b2557345
PL
2970 if (blk_mig_bulk_active()) {
2971 /* Avoid transferring ram during bulk phase of block migration as
2972 * the bulk phase will usually take a long time and transferring
2973 * ram updates during that time is pointless. */
2974 goto out;
2975 }
2976
63268c49
PX
2977 /*
2978 * We'll hold this lock a little longer than usual, but that's okay for two reasons.
2979 * Firstly, the only other thread that can take it is the one that calls
2980 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2981 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2982 * guarantees that we'll release it on a regular basis.
2983 */
2984 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
2985 WITH_RCU_READ_LOCK_GUARD() {
2986 if (ram_list.version != rs->last_version) {
2987 ram_state_reset(rs);
2988 }
56e93d26 2989
89ac5a1d
DDAG
2990 /* Read version before ram_list.blocks */
2991 smp_rmb();
56e93d26 2992
89ac5a1d 2993 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2994
89ac5a1d
DDAG
2995 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2996 i = 0;
2997 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 2998 postcopy_has_request(rs)) {
89ac5a1d 2999 int pages;
e03a34f8 3000
89ac5a1d
DDAG
3001 if (qemu_file_get_error(f)) {
3002 break;
3003 }
e8f3735f 3004
05931ec5 3005 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3006 /* no more pages to sent */
3007 if (pages == 0) {
3008 done = 1;
3009 break;
3010 }
e8f3735f 3011
89ac5a1d
DDAG
3012 if (pages < 0) {
3013 qemu_file_set_error(f, pages);
56e93d26
JQ
3014 break;
3015 }
89ac5a1d
DDAG
3016
3017 rs->target_page_count += pages;
3018
644acf99
WY
3019 /*
3020 * During postcopy, it is necessary to make sure one whole host
3021 * page is sent in one chunk.
3022 */
3023 if (migrate_postcopy_ram()) {
3024 flush_compressed_data(rs);
3025 }
3026
89ac5a1d
DDAG
3027 /*
3028 * We want to check it in the first loop iteration, just in case it
3029 * was the first time and we had to sync the dirty bitmap.
3030 * qemu_clock_get_ns() is a bit expensive, so we only check once
3031 * every few iterations
3032 */
3033 if ((i & 63) == 0) {
3034 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3035 1000000;
3036 if (t1 > MAX_WAIT) {
3037 trace_ram_save_iterate_big_wait(t1, i);
3038 break;
3039 }
3040 }
3041 i++;
56e93d26 3042 }
56e93d26 3043 }
63268c49 3044 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3045
3046 /*
3047 * Must occur before EOS (or any QEMUFile operation)
3048 * because of RDMA protocol.
3049 */
3050 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3051
b2557345 3052out:
b69a0227
JQ
3053 if (ret >= 0
3054 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 3055 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3056 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3057 qemu_fflush(f);
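 /* Account for the 8-byte RAM_SAVE_FLAG_EOS marker written just above */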
4c2d0f6d 3058 ram_transferred_add(8);
56e93d26 3059
3d4095b2
JQ
3060 ret = qemu_file_get_error(f);
3061 }
56e93d26
JQ
3062 if (ret < 0) {
3063 return ret;
3064 }
3065
5c90308f 3066 return done;
56e93d26
JQ
3067}
3068
3d0684b2
JQ
3069/**
3070 * ram_save_complete: function called to send the remaining amount of ram
3071 *
e8f3735f 3072 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3073 *
3074 * Called with iothread lock
3075 *
3076 * @f: QEMUFile where to send the data
3077 * @opaque: RAMState pointer
3078 */
56e93d26
JQ
3079static int ram_save_complete(QEMUFile *f, void *opaque)
3080{
53518d94
JQ
3081 RAMState **temp = opaque;
3082 RAMState *rs = *temp;
e8f3735f 3083 int ret = 0;
6f37bb8b 3084
05931ec5
JQ
3085 rs->last_stage = !migration_in_colo_state();
3086
89ac5a1d
DDAG
3087 WITH_RCU_READ_LOCK_GUARD() {
3088 if (!migration_in_postcopy()) {
3089 migration_bitmap_sync_precopy(rs);
3090 }
56e93d26 3091
89ac5a1d 3092 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3093
89ac5a1d 3094 /* try transferring iterative blocks of memory */
56e93d26 3095
89ac5a1d
DDAG
3096 /* flush all remaining blocks regardless of rate limiting */
3097 while (true) {
3098 int pages;
56e93d26 3099
05931ec5 3100 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3101 /* no more blocks to send */
3102 if (pages == 0) {
3103 break;
3104 }
3105 if (pages < 0) {
3106 ret = pages;
3107 break;
3108 }
e8f3735f 3109 }
56e93d26 3110
89ac5a1d
DDAG
3111 flush_compressed_data(rs);
3112 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3113 }
d09a6fde 3114
3d4095b2 3115 if (ret >= 0) {
99f2c6fb 3116 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3117 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3118 qemu_fflush(f);
3119 }
56e93d26 3120
e8f3735f 3121 return ret;
56e93d26
JQ
3122}
3123
c31b098f 3124static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3125 uint64_t *res_precopy_only,
3126 uint64_t *res_compatible,
3127 uint64_t *res_postcopy_only)
56e93d26 3128{
53518d94
JQ
3129 RAMState **temp = opaque;
3130 RAMState *rs = *temp;
56e93d26
JQ
3131 uint64_t remaining_size;
3132
9edabd4d 3133 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3134
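 /* If what is left looks small enough (below @max_size), re-sync the dirty bitmap so the caller gets a fresh estimate */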
5727309d 3135 if (!migration_in_postcopy() &&
663e6c1d 3136 remaining_size < max_size) {
56e93d26 3137 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3138 WITH_RCU_READ_LOCK_GUARD() {
3139 migration_bitmap_sync_precopy(rs);
3140 }
56e93d26 3141 qemu_mutex_unlock_iothread();
9edabd4d 3142 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3143 }
c31b098f 3144
86e1167e
VSO
3145 if (migrate_postcopy_ram()) {
3146 /* We can do postcopy, and all the data is postcopiable */
47995026 3147 *res_compatible += remaining_size;
86e1167e 3148 } else {
47995026 3149 *res_precopy_only += remaining_size;
86e1167e 3150 }
56e93d26
JQ
3151}
3152
3153static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3154{
3155 unsigned int xh_len;
3156 int xh_flags;
063e760a 3157 uint8_t *loaded_data;
56e93d26 3158
56e93d26
JQ
3159 /* extract RLE header */
3160 xh_flags = qemu_get_byte(f);
3161 xh_len = qemu_get_be16(f);
3162
3163 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3164 error_report("Failed to load XBZRLE page - wrong compression!");
3165 return -1;
3166 }
3167
3168 if (xh_len > TARGET_PAGE_SIZE) {
3169 error_report("Failed to load XBZRLE page - len overflow!");
3170 return -1;
3171 }
f265e0e4 3172 loaded_data = XBZRLE.decoded_buf;
56e93d26 3173 /* load data and decode */
f265e0e4 3174 /* it can change loaded_data to point to an internal buffer */
063e760a 3175 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3176
3177 /* decode RLE */
063e760a 3178 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3179 TARGET_PAGE_SIZE) == -1) {
3180 error_report("Failed to load XBZRLE page - decode error!");
3181 return -1;
3182 }
3183
3184 return 0;
3185}
3186
3d0684b2
JQ
3187/**
3188 * ram_block_from_stream: read a RAMBlock id from the migration stream
3189 *
3190 * Must be called from within a rcu critical section.
3191 *
56e93d26 3192 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3193 *
755e8d7c 3194 * @mis: the migration incoming state pointer
3d0684b2
JQ
3195 * @f: QEMUFile where to read the data from
3196 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3197 */
755e8d7c
PX
3198static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3199 QEMUFile *f, int flags)
56e93d26 3200{
755e8d7c 3201 RAMBlock *block = mis->last_recv_block;
56e93d26
JQ
3202 char id[256];
3203 uint8_t len;
3204
3205 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3206 if (!block) {
56e93d26
JQ
3207 error_report("Ack, bad migration stream!");
3208 return NULL;
3209 }
4c4bad48 3210 return block;
56e93d26
JQ
3211 }
3212
3213 len = qemu_get_byte(f);
3214 qemu_get_buffer(f, (uint8_t *)id, len);
3215 id[len] = 0;
3216
e3dd7493 3217 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3218 if (!block) {
3219 error_report("Can't find block %s", id);
3220 return NULL;
56e93d26
JQ
3221 }
3222
fbd162e6 3223 if (ramblock_is_ignored(block)) {
b895de50
CLG
3224 error_report("block %s should not be migrated !", id);
3225 return NULL;
3226 }
3227
755e8d7c
PX
3228 mis->last_recv_block = block;
3229
4c4bad48
HZ
3230 return block;
3231}
3232
3233static inline void *host_from_ram_block_offset(RAMBlock *block,
3234 ram_addr_t offset)
3235{
3236 if (!offset_in_ramblock(block, offset)) {
3237 return NULL;
3238 }
3239
3240 return block->host + offset;
56e93d26
JQ
3241}
3242
6a23f639
DH
3243static void *host_page_from_ram_block_offset(RAMBlock *block,
3244 ram_addr_t offset)
3245{
3246 /* Note: Explicitly no check against offset_in_ramblock(). */
3247 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3248 block->page_size);
3249}
3250
3251static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3252 ram_addr_t offset)
3253{
3254 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3255}
3256
13af18f2 3257static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3258 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3259{
3260 if (!offset_in_ramblock(block, offset)) {
3261 return NULL;
3262 }
3263 if (!block->colo_cache) {
3264 error_report("%s: colo_cache is NULL in block :%s",
3265 __func__, block->idstr);
3266 return NULL;
3267 }
7d9acafa
ZC
3268
3269 /*
3270 * During colo checkpoint, we need bitmap of these migrated pages.
3271 * It help us to decide which pages in ram cache should be flushed
3272 * into VM's RAM later.
3273 */
8af66371
HZ
3274 if (record_bitmap &&
3275 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3276 ram_state->migration_dirty_pages++;
3277 }
13af18f2
ZC
3278 return block->colo_cache + offset;
3279}
3280
3d0684b2
JQ
3281/**
3282 * ram_handle_compressed: handle the zero page case
3283 *
56e93d26
JQ
3284 * If a page (or a whole RDMA chunk) has been
3285 * determined to be zero, then zap it.
3d0684b2
JQ
3286 *
3287 * @host: host address for the zero page
3288 * @ch: what the page is filled from. We only support zero
3289 * @size: size of the zero page
56e93d26
JQ
3290 */
3291void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3292{
bad452a7 3293 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3294 memset(host, ch, size);
3295 }
3296}
3297
797ca154
XG
3298/* return the size after decompression, or negative value on error */
3299static int
3300qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3301 const uint8_t *source, size_t source_len)
3302{
3303 int err;
3304
3305 err = inflateReset(stream);
3306 if (err != Z_OK) {
3307 return -1;
3308 }
3309
3310 stream->avail_in = source_len;
3311 stream->next_in = (uint8_t *)source;
3312 stream->avail_out = dest_len;
3313 stream->next_out = dest;
3314
3315 err = inflate(stream, Z_NO_FLUSH);
3316 if (err != Z_STREAM_END) {
3317 return -1;
3318 }
3319
3320 return stream->total_out;
3321}
3322
56e93d26
JQ
3323static void *do_data_decompress(void *opaque)
3324{
3325 DecompressParam *param = opaque;
3326 unsigned long pagesize;
33d151f4 3327 uint8_t *des;
34ab9e97 3328 int len, ret;
56e93d26 3329
33d151f4 3330 qemu_mutex_lock(&param->mutex);
90e56fb4 3331 while (!param->quit) {
33d151f4
LL
3332 if (param->des) {
3333 des = param->des;
3334 len = param->len;
3335 param->des = 0;
3336 qemu_mutex_unlock(&param->mutex);
3337
56e93d26 3338 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3339
3340 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3341 param->compbuf, len);
f548222c 3342 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3343 error_report("decompress data failed");
3344 qemu_file_set_error(decomp_file, ret);
3345 }
73a8912b 3346
33d151f4
LL
3347 qemu_mutex_lock(&decomp_done_lock);
3348 param->done = true;
3349 qemu_cond_signal(&decomp_done_cond);
3350 qemu_mutex_unlock(&decomp_done_lock);
3351
3352 qemu_mutex_lock(&param->mutex);
3353 } else {
3354 qemu_cond_wait(&param->cond, &param->mutex);
3355 }
56e93d26 3356 }
33d151f4 3357 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3358
3359 return NULL;
3360}
3361
34ab9e97 3362static int wait_for_decompress_done(void)
5533b2e9
LL
3363{
3364 int idx, thread_count;
3365
3366 if (!migrate_use_compression()) {
34ab9e97 3367 return 0;
5533b2e9
LL
3368 }
3369
3370 thread_count = migrate_decompress_threads();
3371 qemu_mutex_lock(&decomp_done_lock);
3372 for (idx = 0; idx < thread_count; idx++) {
3373 while (!decomp_param[idx].done) {
3374 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3375 }
3376 }
3377 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3378 return qemu_file_get_error(decomp_file);
3379}
3380
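/*
 * Both load paths use this barrier: ram_load_precopy() calls it once after
 * the stream ends, while ram_load_postcopy() calls it before each host page
 * is placed, since a compressed target page must be fully inflated into the
 * temporary huge page before the page is published to the guest.
 */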
f0afa331 3381static void compress_threads_load_cleanup(void)
3382{
3383 int i, thread_count;
3384
3385 if (!migrate_use_compression()) {
3386 return;
3387 }
3388 thread_count = migrate_decompress_threads();
3389 for (i = 0; i < thread_count; i++) {
3390 /*
3391 * We use it as an indicator of whether the thread has been
3392 * properly initialized or not
3393 */
3394 if (!decomp_param[i].compbuf) {
3395 break;
3396 }
3397
56e93d26 3398 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3399 decomp_param[i].quit = true;
3400 qemu_cond_signal(&decomp_param[i].cond);
3401 qemu_mutex_unlock(&decomp_param[i].mutex);
3402 }
3403 for (i = 0; i < thread_count; i++) {
3404 if (!decomp_param[i].compbuf) {
3405 break;
3406 }
3407
3408 qemu_thread_join(decompress_threads + i);
3409 qemu_mutex_destroy(&decomp_param[i].mutex);
3410 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3411 inflateEnd(&decomp_param[i].stream);
56e93d26 3412 g_free(decomp_param[i].compbuf);
797ca154 3413 decomp_param[i].compbuf = NULL;
3414 }
3415 g_free(decompress_threads);
3416 g_free(decomp_param);
3417 decompress_threads = NULL;
3418 decomp_param = NULL;
34ab9e97 3419 decomp_file = NULL;
3420}
3421
34ab9e97 3422static int compress_threads_load_setup(QEMUFile *f)
3423{
3424 int i, thread_count;
3425
3426 if (!migrate_use_compression()) {
3427 return 0;
3428 }
3429
3430 thread_count = migrate_decompress_threads();
3431 decompress_threads = g_new0(QemuThread, thread_count);
3432 decomp_param = g_new0(DecompressParam, thread_count);
3433 qemu_mutex_init(&decomp_done_lock);
3434 qemu_cond_init(&decomp_done_cond);
34ab9e97 3435 decomp_file = f;
3436 for (i = 0; i < thread_count; i++) {
3437 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3438 goto exit;
3439 }
3440
3441 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3442 qemu_mutex_init(&decomp_param[i].mutex);
3443 qemu_cond_init(&decomp_param[i].cond);
3444 decomp_param[i].done = true;
3445 decomp_param[i].quit = false;
3446 qemu_thread_create(decompress_threads + i, "decompress",
3447 do_data_decompress, decomp_param + i,
3448 QEMU_THREAD_JOINABLE);
3449 }
3450 return 0;
3451exit:
3452 compress_threads_load_cleanup();
3453 return -1;
3454}
3455
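/*
 * Cleanup note: compbuf doubles as the "this slot was initialised" marker
 * (see compress_threads_load_cleanup() above), so the error path here can
 * simply call the cleanup function on a partially initialised array.
 */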
c1bc6626 3456static void decompress_data_with_multi_threads(QEMUFile *f,
3457 void *host, int len)
3458{
3459 int idx, thread_count;
3460
3461 thread_count = migrate_decompress_threads();
37396950 3462 QEMU_LOCK_GUARD(&decomp_done_lock);
3463 while (true) {
3464 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3465 if (decomp_param[idx].done) {
3466 decomp_param[idx].done = false;
3467 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3468 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3469 decomp_param[idx].des = host;
3470 decomp_param[idx].len = len;
3471 qemu_cond_signal(&decomp_param[idx].cond);
3472 qemu_mutex_unlock(&decomp_param[idx].mutex);
3473 break;
3474 }
3475 }
3476 if (idx < thread_count) {
3477 break;
3478 } else {
3479 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3480 }
3481 }
3482}
3483
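/*
 * Dispatch note: the loop above only sleeps on decomp_done_cond when every
 * worker is busy; the first worker to finish signals the condition and
 * frees a slot.  decomp_done_lock is held for the whole dispatch, which is
 * what keeps the ->done flags consistent.
 */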
3484static void colo_init_ram_state(void)
3485{
3486 ram_state_init(&ram_state);
3487}
3488
3489/*
3490 * colo cache: this is for the secondary VM, we cache the whole
3491 * memory of the secondary VM. The global lock must be held
3492 * to call this helper.
3493 */
3494int colo_init_ram_cache(void)
3495{
3496 RAMBlock *block;
3497
3498 WITH_RCU_READ_LOCK_GUARD() {
3499 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3500 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3501 NULL, false, false);
3502 if (!block->colo_cache) {
3503 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3504 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3505 block->used_length);
3506 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3507 if (block->colo_cache) {
3508 qemu_anon_ram_free(block->colo_cache, block->used_length);
3509 block->colo_cache = NULL;
3510 }
89ac5a1d 3511 }
44901b5a 3512 return -errno;
89ac5a1d 3513 }
3514 if (!machine_dump_guest_core(current_machine)) {
3515 qemu_madvise(block->colo_cache, block->used_length,
3516 QEMU_MADV_DONTDUMP);
3517 }
13af18f2 3518 }
13af18f2 3519 }
44901b5a 3520
3521 /*
3522 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3523 * to decide which pages in the cache should be flushed into the SVM's RAM.
3524 * Here we use the same name 'ram_bitmap' as for migration.
3525 */
3526 if (ram_bytes_total()) {
3527 RAMBlock *block;
3528
fbd162e6 3529 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3530 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3531 block->bmap = bitmap_new(pages);
3532 }
3533 }
7d9acafa 3534
b70cb3b4 3535 colo_init_ram_state();
13af18f2 3536 return 0;
3537}
3538
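/*
 * Error handling sketch for the allocator above: if qemu_anon_ram_alloc()
 * fails for any block, the caches already allocated for earlier blocks are
 * freed again and -errno is returned, so a failed COLO setup leaves no
 * colo_cache behind.  The per-block bmap sized on max_length is the dirty
 * bitmap that colo_cache_from_block_offset() records into.
 */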
3539/* TODO: duplicated with ram_init_bitmaps */
3540void colo_incoming_start_dirty_log(void)
3541{
3542 RAMBlock *block = NULL;
3543 /* For memory_global_dirty_log_start below. */
3544 qemu_mutex_lock_iothread();
3545 qemu_mutex_lock_ramlist();
3546
3547 memory_global_dirty_log_sync();
3548 WITH_RCU_READ_LOCK_GUARD() {
3549 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3550 ramblock_sync_dirty_bitmap(ram_state, block);
3551 /* Discard this dirty bitmap record */
3552 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3553 }
63b41db4 3554 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3555 }
3556 ram_state->migration_dirty_pages = 0;
3557 qemu_mutex_unlock_ramlist();
3558 qemu_mutex_unlock_iothread();
3559}
3560
3561/* The global lock must be held to call this helper */
3562void colo_release_ram_cache(void)
3563{
3564 RAMBlock *block;
3565
63b41db4 3566 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3567 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3568 g_free(block->bmap);
3569 block->bmap = NULL;
3570 }
3571
3572 WITH_RCU_READ_LOCK_GUARD() {
3573 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3574 if (block->colo_cache) {
3575 qemu_anon_ram_free(block->colo_cache, block->used_length);
3576 block->colo_cache = NULL;
3577 }
3578 }
3579 }
0393031a 3580 ram_state_cleanup(&ram_state);
3581}
3582
3583/**
3584 * ram_load_setup: Setup RAM for migration incoming side
3585 *
3586 * Returns zero to indicate success and negative for error
3587 *
3588 * @f: QEMUFile where to receive the data
3589 * @opaque: RAMState pointer
3590 */
3591static int ram_load_setup(QEMUFile *f, void *opaque)
3592{
34ab9e97 3593 if (compress_threads_load_setup(f)) {
3594 return -1;
3595 }
3596
f265e0e4 3597 xbzrle_load_setup();
f9494614 3598 ramblock_recv_map_init();
13af18f2 3599
3600 return 0;
3601}
3602
3603static int ram_load_cleanup(void *opaque)
3604{
f9494614 3605 RAMBlock *rb;
56eb90af 3606
fbd162e6 3607 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3608 qemu_ram_block_writeback(rb);
3609 }
3610
f265e0e4 3611 xbzrle_load_cleanup();
f0afa331 3612 compress_threads_load_cleanup();
f9494614 3613
fbd162e6 3614 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3615 g_free(rb->receivedmap);
3616 rb->receivedmap = NULL;
3617 }
13af18f2 3618
3619 return 0;
3620}
3621
3622/**
3623 * ram_postcopy_incoming_init: allocate postcopy data structures
3624 *
3625 * Returns 0 for success and negative in case of error
3626 *
3627 * @mis: current migration incoming state
3628 *
3629 * Allocate data structures etc needed by incoming migration with
3630 * postcopy-ram. postcopy-ram's similarly named
3631 * postcopy_ram_incoming_init does the work.
3632 */
3633int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3634{
c136180c 3635 return postcopy_ram_incoming_init(mis);
3636}
3637
3638/**
3639 * ram_load_postcopy: load a page in postcopy case
3640 *
3641 * Returns 0 for success or -errno in case of error
3642 *
3643 * Called in postcopy mode by ram_load().
3644 * rcu_read_lock is taken prior to this being called.
3645 *
3646 * @f: QEMUFile to receive the data from
a7180877 3647 */
929068ec 3648int ram_load_postcopy(QEMUFile *f)
3649{
3650 int flags = 0, ret = 0;
3651 bool place_needed = false;
1aa83678 3652 bool matches_target_page_size = false;
a7180877 3653 MigrationIncomingState *mis = migration_incoming_get_current();
3654 /* Currently we only use channel 0. TODO: use all the channels */
3655 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
3656
3657 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3658 ram_addr_t addr;
3659 void *page_buffer = NULL;
3660 void *place_source = NULL;
df9ff5e1 3661 RAMBlock *block = NULL;
a7180877 3662 uint8_t ch;
644acf99 3663 int len;
3664
3665 addr = qemu_get_be64(f);
3666
3667 /*
3668 * If qemu file error, we should stop here, and then "addr"
3669 * may be invalid
3670 */
3671 ret = qemu_file_get_error(f);
3672 if (ret) {
3673 break;
3674 }
3675
3676 flags = addr & ~TARGET_PAGE_MASK;
3677 addr &= TARGET_PAGE_MASK;
3678
3679 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3680 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3681 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
755e8d7c 3682 block = ram_block_from_stream(mis, f, flags);
3683 if (!block) {
3684 ret = -EINVAL;
3685 break;
3686 }
4c4bad48 3687
3688 /*
3689 * Relying on used_length is racy and can result in false positives.
3690 * We might place pages beyond used_length in case RAM was shrunk
3691 * while in postcopy, which is fine - trying to place via
3692 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3693 */
3694 if (!block->host || addr >= block->postcopy_length) {
3695 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3696 ret = -EINVAL;
3697 break;
3698 }
77dadc3f 3699 tmp_page->target_pages++;
1aa83678 3700 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3701 /*
3702 * Postcopy requires that we place whole host pages atomically;
3703 * these may be huge pages for RAMBlocks that are backed by
3704 * hugetlbfs.
3705 * To make it atomic, the data is read into a temporary page
3706 * that's moved into place later.
3707 * The migration protocol uses, possibly smaller, target-pages
3708 * however the source ensures it always sends all the components
91ba442f 3709 * of a host page in one chunk.
a7180877 3710 */
77dadc3f 3711 page_buffer = tmp_page->tmp_huge_page +
3712 host_page_offset_from_ram_block_offset(block, addr);
3713 /* If all target pages are zero then we can optimise the placement */
3714 if (tmp_page->target_pages == 1) {
3715 tmp_page->host_addr =
3716 host_page_from_ram_block_offset(block, addr);
3717 } else if (tmp_page->host_addr !=
3718 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 3719 /* not the first target page within the host page */
3720 error_report("Non-same host page detected. "
3721 "Target host page %p, received host page %p "
3722 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3723 tmp_page->host_addr,
3724 host_page_from_ram_block_offset(block, addr),
3725 block->idstr, addr, tmp_page->target_pages);
3726 ret = -EINVAL;
3727 break;
3728 }
3729
3730 /*
3731 * If it's the last part of a host page then we place the host
3732 * page
3733 */
3734 if (tmp_page->target_pages ==
3735 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 3736 place_needed = true;
4cbb3c63 3737 }
77dadc3f 3738 place_source = tmp_page->tmp_huge_page;
3739 }
3740
3741 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3742 case RAM_SAVE_FLAG_ZERO:
a7180877 3743 ch = qemu_get_byte(f);
3744 /*
3745 * We can skip setting page_buffer when
3746 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3747 */
3748 if (ch || !matches_target_page_size) {
3749 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3750 }
a7180877 3751 if (ch) {
77dadc3f 3752 tmp_page->all_zero = false;
3753 }
3754 break;
3755
3756 case RAM_SAVE_FLAG_PAGE:
77dadc3f 3757 tmp_page->all_zero = false;
3758 if (!matches_target_page_size) {
3759 /* For huge pages, we always use temporary buffer */
3760 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3761 } else {
3762 /*
3763 * For small pages that matches target page size, we
3764 * avoid the qemu_file copy. Instead we directly use
3765 * the buffer of QEMUFile to place the page. Note: we
3766 * cannot do any QEMUFile operation before using that
3767 * buffer to make sure the buffer is valid when
3768 * placing the page.
3769 */
3770 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3771 TARGET_PAGE_SIZE);
3772 }
3773 break;
644acf99 3774 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 3775 tmp_page->all_zero = false;
3776 len = qemu_get_be32(f);
3777 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3778 error_report("Invalid compressed data length: %d", len);
3779 ret = -EINVAL;
3780 break;
3781 }
3782 decompress_data_with_multi_threads(f, page_buffer, len);
3783 break;
3784
3785 case RAM_SAVE_FLAG_EOS:
3786 /* normal exit */
6df264ac 3787 multifd_recv_sync_main();
3788 break;
3789 default:
29fccade 3790 error_report("Unknown combination of migration flags: 0x%x"
3791 " (postcopy mode)", flags);
3792 ret = -EINVAL;
3793 break;
3794 }
3795
3796 /* Got the whole host page, wait for decompress before placing. */
3797 if (place_needed) {
3798 ret |= wait_for_decompress_done();
3799 }
3800
3801 /* Detect any possible file errors */
3802 if (!ret && qemu_file_get_error(f)) {
3803 ret = qemu_file_get_error(f);
3804 }
3805
7a9ddfbf 3806 if (!ret && place_needed) {
3807 if (tmp_page->all_zero) {
3808 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 3809 } else {
3810 ret = postcopy_place_page(mis, tmp_page->host_addr,
3811 place_source, block);
a7180877 3812 }
ddf35bdf 3813 place_needed = false;
77dadc3f 3814 postcopy_temp_page_reset(tmp_page);
a7180877 3815 }
3816 }
3817
3818 return ret;
3819}
3820
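/*
 * Worked example for the host-page assembly above (numbers are only an
 * illustration): with a 2 MiB hugetlbfs-backed RAMBlock and 4 KiB target
 * pages, block->page_size / TARGET_PAGE_SIZE == 512, so 512 consecutive
 * target pages are accumulated in tmp_page->tmp_huge_page and only the last
 * one sets place_needed; the whole huge page is then published atomically
 * with postcopy_place_page() or postcopy_place_page_zero().
 */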
3821static bool postcopy_is_advised(void)
3822{
3823 PostcopyState ps = postcopy_state_get();
3824 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3825}
3826
3827static bool postcopy_is_running(void)
3828{
3829 PostcopyState ps = postcopy_state_get();
3830 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3831}
3832
3833/*
3834 * Flush content of RAM cache into SVM's memory.
3835 * Only flush the pages that have been dirtied by the PVM or the SVM, or both.
3836 */
24fa16f8 3837void colo_flush_ram_cache(void)
3838{
3839 RAMBlock *block = NULL;
3840 void *dst_host;
3841 void *src_host;
3842 unsigned long offset = 0;
3843
d1955d22 3844 memory_global_dirty_log_sync();
3845 WITH_RCU_READ_LOCK_GUARD() {
3846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3847 ramblock_sync_dirty_bitmap(ram_state, block);
3848 }
d1955d22 3849 }
d1955d22 3850
e6f4aa18 3851 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3852 WITH_RCU_READ_LOCK_GUARD() {
3853 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3854
89ac5a1d 3855 while (block) {
a6a83cef 3856 unsigned long num = 0;
e6f4aa18 3857
a6a83cef 3858 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3859 if (!offset_in_ramblock(block,
3860 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 3861 offset = 0;
a6a83cef 3862 num = 0;
3863 block = QLIST_NEXT_RCU(block, next);
3864 } else {
3865 unsigned long i = 0;
3866
3867 for (i = 0; i < num; i++) {
3868 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3869 }
3870 dst_host = block->host
3871 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3872 src_host = block->colo_cache
3873 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3874 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3875 offset += num;
89ac5a1d 3876 }
3877 }
3878 }
3879 trace_colo_flush_ram_cache_end();
3880}
3881
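/*
 * Flush sketch: after syncing the dirty log, colo_bitmap_find_dirty() hands
 * back runs of dirty pages; for each run the bits are cleared and the bytes
 * are copied from block->colo_cache into block->host, i.e. from the cache
 * filled by colo_cache_from_block_offset() into the SVM's real RAM.
 */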
3882/**
3883 * ram_load_precopy: load pages in precopy case
3884 *
3885 * Returns 0 for success or -errno in case of error
3886 *
3887 * Called in precopy mode by ram_load().
3888 * rcu_read_lock is taken prior to this being called.
3889 *
3890 * @f: QEMUFile to receive the data from
3891 */
3892static int ram_load_precopy(QEMUFile *f)
56e93d26 3893{
755e8d7c 3894 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 3895 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3896 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3897 bool postcopy_advised = postcopy_is_advised();
3898 if (!migrate_use_compression()) {
3899 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3900 }
a7180877 3901
10da4a36 3902 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3903 ram_addr_t addr, total_ram_bytes;
0393031a 3904 void *host = NULL, *host_bak = NULL;
3905 uint8_t ch;
3906
3907 /*
3908 * Yield periodically to let main loop run, but an iteration of
3909 * the main loop is expensive, so only do it once every several iterations
3910 */
3911 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3912 aio_co_schedule(qemu_get_current_aio_context(),
3913 qemu_coroutine_self());
3914 qemu_coroutine_yield();
3915 }
3916 i++;
3917
3918 addr = qemu_get_be64(f);
3919 flags = addr & ~TARGET_PAGE_MASK;
3920 addr &= TARGET_PAGE_MASK;
3921
3922 if (flags & invalid_flags) {
3923 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3924 error_report("Received an unexpected compressed page");
3925 }
3926
3927 ret = -EINVAL;
3928 break;
3929 }
3930
bb890ed5 3931 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3932 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
755e8d7c 3933 RAMBlock *block = ram_block_from_stream(mis, f, flags);
4c4bad48 3934
0393031a 3935 host = host_from_ram_block_offset(block, addr);
13af18f2 3936 /*
3937 * After entering the COLO stage, we should not load pages
3938 * into the SVM's memory directly; we put them into colo_cache first.
3939 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3940 * Previously, we copied all of this memory in the COLO preparation
3941 * stage, during which the VM had to be stopped, which is time-consuming.
3942 * Here we optimize it with a trick: back up every page during the
3943 * migration process while COLO is enabled. Although this affects the
3944 * speed of the migration, it clearly reduces the downtime of backing
3945 * up all of the SVM's memory in the COLO preparation stage.
13af18f2 3946 */
3947 if (migration_incoming_colo_enabled()) {
3948 if (migration_incoming_in_colo_state()) {
3949 /* In COLO stage, put all pages into cache temporarily */
8af66371 3950 host = colo_cache_from_block_offset(block, addr, true);
3951 } else {
3952 /*
3953 * In migration stage but before COLO stage,
3954 * Put all pages into both cache and SVM's memory.
3955 */
8af66371 3956 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3957 }
13af18f2 3958 }
3959 if (!host) {
3960 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3961 ret = -EINVAL;
3962 break;
3963 }
3964 if (!migration_incoming_in_colo_state()) {
3965 ramblock_recv_bitmap_set(block, host);
3966 }
3967
1db9d8e5 3968 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3969 }
3970
3971 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3972 case RAM_SAVE_FLAG_MEM_SIZE:
3973 /* Synchronize RAM block list */
3974 total_ram_bytes = addr;
3975 while (!ret && total_ram_bytes) {
3976 RAMBlock *block;
56e93d26
JQ
3977 char id[256];
3978 ram_addr_t length;
3979
3980 len = qemu_get_byte(f);
3981 qemu_get_buffer(f, (uint8_t *)id, len);
3982 id[len] = 0;
3983 length = qemu_get_be64(f);
3984
e3dd7493 3985 block = qemu_ram_block_by_name(id);
3986 if (block && !qemu_ram_is_migratable(block)) {
3987 error_report("block %s should not be migrated!", id);
3988 ret = -EINVAL;
3989 } else if (block) {
3990 if (length != block->used_length) {
3991 Error *local_err = NULL;
56e93d26 3992
fa53a0e5 3993 ret = qemu_ram_resize(block, length,
3994 &local_err);
3995 if (local_err) {
3996 error_report_err(local_err);
56e93d26 3997 }
56e93d26 3998 }
ef08fb38 3999 /* For postcopy we need to check hugepage sizes match */
e846b746 4000 if (postcopy_advised && migrate_postcopy_ram() &&
4001 block->page_size != qemu_host_page_size) {
4002 uint64_t remote_page_size = qemu_get_be64(f);
4003 if (remote_page_size != block->page_size) {
4004 error_report("Mismatched RAM page size %s "
4005 "(local) %zd != %" PRId64,
4006 id, block->page_size,
4007 remote_page_size);
4008 ret = -EINVAL;
4009 }
4010 }
4011 if (migrate_ignore_shared()) {
4012 hwaddr addr = qemu_get_be64(f);
4013 if (ramblock_is_ignored(block) &&
4014 block->mr->addr != addr) {
4015 error_report("Mismatched GPAs for block %s "
4016 "%" PRId64 "!= %" PRId64,
4017 id, (uint64_t)addr,
4018 (uint64_t)block->mr->addr);
4019 ret = -EINVAL;
4020 }
4021 }
4022 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4023 block->idstr);
4024 } else {
4025 error_report("Unknown ramblock \"%s\", cannot "
4026 "accept migration", id);
4027 ret = -EINVAL;
4028 }
4029
4030 total_ram_bytes -= length;
4031 }
4032 break;
a776aa15 4033
bb890ed5 4034 case RAM_SAVE_FLAG_ZERO:
4035 ch = qemu_get_byte(f);
4036 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4037 break;
a776aa15 4038
56e93d26 4039 case RAM_SAVE_FLAG_PAGE:
4040 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4041 break;
56e93d26 4042
a776aa15 4043 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4044 len = qemu_get_be32(f);
4045 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4046 error_report("Invalid compressed data length: %d", len);
4047 ret = -EINVAL;
4048 break;
4049 }
c1bc6626 4050 decompress_data_with_multi_threads(f, host, len);
56e93d26 4051 break;
a776aa15 4052
56e93d26 4053 case RAM_SAVE_FLAG_XBZRLE:
4054 if (load_xbzrle(f, addr, host) < 0) {
4055 error_report("Failed to decompress XBZRLE page at "
4056 RAM_ADDR_FMT, addr);
4057 ret = -EINVAL;
4058 break;
4059 }
4060 break;
4061 case RAM_SAVE_FLAG_EOS:
4062 /* normal exit */
6df264ac 4063 multifd_recv_sync_main();
4064 break;
4065 default:
4066 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4067 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4068 } else {
29fccade 4069 error_report("Unknown combination of migration flags: 0x%x",
4070 flags);
4071 ret = -EINVAL;
4072 }
4073 }
4074 if (!ret) {
4075 ret = qemu_file_get_error(f);
4076 }
4077 if (!ret && host_bak) {
4078 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4079 }
4080 }
4081
ca1a6b70 4082 ret |= wait_for_decompress_done();
4083 return ret;
4084}
4085
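/*
 * Stream layout handled by the loop above, in outline: every record starts
 * with a be64 whose low bits (below TARGET_PAGE_MASK) carry the
 * RAM_SAVE_FLAG_* bits and whose upper bits are the page address.  Depending
 * on the flag the payload is a single fill byte (ZERO), TARGET_PAGE_SIZE raw
 * bytes (PAGE), a be32 length plus compressed data (COMPRESS_PAGE), or an
 * XBZRLE-encoded delta, with RAM_SAVE_FLAG_EOS terminating the section.
 */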
4086static int ram_load(QEMUFile *f, void *opaque, int version_id)
4087{
4088 int ret = 0;
4089 static uint64_t seq_iter;
4090 /*
4091 * If system is running in postcopy mode, page inserts to host memory must
4092 * be atomic
4093 */
4094 bool postcopy_running = postcopy_is_running();
4095
4096 seq_iter++;
4097
4098 if (version_id != 4) {
4099 return -EINVAL;
4100 }
4101
4102 /*
4103 * This RCU critical section can be very long running.
4104 * When RCU reclaims in the code start to become numerous,
4105 * it will be necessary to reduce the granularity of this
4106 * critical section.
4107 */
4108 WITH_RCU_READ_LOCK_GUARD() {
4109 if (postcopy_running) {
4110 ret = ram_load_postcopy(f);
4111 } else {
4112 ret = ram_load_precopy(f);
4113 }
10da4a36 4114 }
55c4446b 4115 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4116
4117 return ret;
4118}
4119
4120static bool ram_has_postcopy(void *opaque)
4121{
469dd51b 4122 RAMBlock *rb;
fbd162e6 4123 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4124 if (ramblock_is_pmem(rb)) {
4125 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4126 "is not supported now!", rb->idstr, rb->host);
4127 return false;
4128 }
4129 }
4130
4131 return migrate_postcopy_ram();
4132}
4133
4134/* Sync all the dirty bitmaps with the destination VM. */
4135static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4136{
4137 RAMBlock *block;
4138 QEMUFile *file = s->to_dst_file;
4139 int ramblock_count = 0;
4140
4141 trace_ram_dirty_bitmap_sync_start();
4142
fbd162e6 4143 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4144 qemu_savevm_send_recv_bitmap(file, block->idstr);
4145 trace_ram_dirty_bitmap_request(block->idstr);
4146 ramblock_count++;
4147 }
4148
4149 trace_ram_dirty_bitmap_sync_wait();
4150
4151 /* Wait until all the ramblocks' dirty bitmaps are synced */
4152 while (ramblock_count--) {
4153 qemu_sem_wait(&s->rp_state.rp_sem);
4154 }
4155
4156 trace_ram_dirty_bitmap_sync_complete();
4157
4158 return 0;
4159}
4160
4161static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4162{
4163 qemu_sem_post(&s->rp_state.rp_sem);
4164}
4165
4166/*
4167 * Read the received bitmap, revert it as the initial dirty bitmap.
4168 * This is only used when the postcopy migration is paused but wants
4169 * to resume from a middle point.
4170 */
4171int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4172{
4173 int ret = -EINVAL;
43044ac0 4174 /* from_dst_file is always valid because we're within rp_thread */
4175 QEMUFile *file = s->rp_state.from_dst_file;
4176 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4177 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4178 uint64_t size, end_mark;
4179
4180 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4181
4182 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4183 error_report("%s: incorrect state %s", __func__,
4184 MigrationStatus_str(s->state));
4185 return -EINVAL;
4186 }
4187
4188 /*
4189 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4190 * need the endianness conversion, and the padding.
4191 */
4192 local_size = ROUND_UP(local_size, 8);
4193
4194 /* Add paddings */
4195 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4196
4197 size = qemu_get_be64(file);
4198
4199 /* The size of the bitmap should match with our ramblock */
4200 if (size != local_size) {
4201 error_report("%s: ramblock '%s' bitmap size mismatch "
4202 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4203 block->idstr, size, local_size);
4204 ret = -EINVAL;
4205 goto out;
4206 }
4207
4208 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4209 end_mark = qemu_get_be64(file);
4210
4211 ret = qemu_file_get_error(file);
4212 if (ret || size != local_size) {
4213 error_report("%s: read bitmap failed for ramblock '%s': %d"
4214 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4215 __func__, block->idstr, ret, local_size, size);
4216 ret = -EIO;
4217 goto out;
4218 }
4219
4220 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4221 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4222 __func__, block->idstr, end_mark);
4223 ret = -EINVAL;
4224 goto out;
4225 }
4226
4227 /*
3a4452d8 4228 * Endianness conversion. We are in postcopy (though paused).
4229 * The dirty bitmap won't change. We can directly modify it.
4230 */
4231 bitmap_from_le(block->bmap, le_bitmap, nbits);
4232
4233 /*
4234 * What we received is the "received bitmap". Invert it to form the initial
4235 * dirty bitmap for this ramblock.
4236 */
4237 bitmap_complement(block->bmap, block->bmap, nbits);
4238
4239 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4240 ramblock_dirty_bitmap_clear_discarded_pages(block);
4241
4242 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4243 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4244
4245 /*
4246 * We have successfully synced the bitmap for the current ramblock. If this is
4247 * the last one to sync, we need to notify the main send thread.
4248 */
4249 ram_dirty_bitmap_reload_notify(s);
4250
4251 ret = 0;
4252out:
bf269906 4253 g_free(le_bitmap);
4254 return ret;
4255}
4256
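/*
 * Tiny worked example for the reload above (illustrative values): if the
 * destination reports the received bitmap 1,1,0,1 for four pages, then
 * bitmap_complement() turns it into 0,0,1,0, i.e. only the page that never
 * arrived is marked dirty again and will be re-sent once the paused
 * postcopy migration resumes.
 */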
4257static int ram_resume_prepare(MigrationState *s, void *opaque)
4258{
4259 RAMState *rs = *(RAMState **)opaque;
08614f34 4260 int ret;
edd090c7 4261
4262 ret = ram_dirty_bitmap_sync_all(s, rs);
4263 if (ret) {
4264 return ret;
4265 }
4266
4267 ram_state_resume_prepare(rs, s->to_dst_file);
4268
4269 return 0;
4270}
4271
56e93d26 4272static SaveVMHandlers savevm_ram_handlers = {
9907e842 4273 .save_setup = ram_save_setup,
56e93d26 4274 .save_live_iterate = ram_save_iterate,
763c906b 4275 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4276 .save_live_complete_precopy = ram_save_complete,
c6467627 4277 .has_postcopy = ram_has_postcopy,
4278 .save_live_pending = ram_save_pending,
4279 .load_state = ram_load,
4280 .save_cleanup = ram_save_cleanup,
4281 .load_setup = ram_load_setup,
4282 .load_cleanup = ram_load_cleanup,
edd090c7 4283 .resume_prepare = ram_resume_prepare,
4284};
4285
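/*
 * These handlers are registered by ram_mig_init() below via
 * register_savevm_live("ram", 0, 4, ...); the version number 4 is the same
 * one ram_load() insists on, so an incompatible stream is rejected before
 * any page data is parsed.
 */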
4286static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4287 size_t old_size, size_t new_size)
4288{
cc61c703 4289 PostcopyState ps = postcopy_state_get();
4290 ram_addr_t offset;
4291 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4292 Error *err = NULL;
4293
4294 if (ramblock_is_ignored(rb)) {
4295 return;
4296 }
4297
4298 if (!migration_is_idle()) {
4299 /*
4300 * Precopy code on the source cannot deal with the size of RAM blocks
4301 * changing at random points in time - especially after sending the
4302 * RAM block sizes in the migration stream, they must no longer change.
4303 * Abort and indicate a proper reason.
4304 */
4305 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4306 migration_cancel(err);
c7c0e724 4307 error_free(err);
c7c0e724 4308 }
4309
4310 switch (ps) {
4311 case POSTCOPY_INCOMING_ADVISE:
4312 /*
4313 * Update what ram_postcopy_incoming_init()->init_range() does at the
4314 * time postcopy was advised. Syncing RAM blocks with the source will
4315 * result in RAM resizes.
4316 */
4317 if (old_size < new_size) {
4318 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4319 error_report("RAM block '%s' discard of resized RAM failed",
4320 rb->idstr);
4321 }
4322 }
898ba906 4323 rb->postcopy_length = new_size;
4324 break;
4325 case POSTCOPY_INCOMING_NONE:
4326 case POSTCOPY_INCOMING_RUNNING:
4327 case POSTCOPY_INCOMING_END:
4328 /*
4329 * Once our guest is running, postcopy does no longer care about
4330 * resizes. When growing, the new memory was not available on the
4331 * source, no handler needed.
4332 */
4333 break;
4334 default:
4335 error_report("RAM block '%s' resized during postcopy state: %d",
4336 rb->idstr, ps);
4337 exit(-1);
4338 }
4339}
4340
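/*
 * Resize-handling summary for the notifier above: on the source, any resize
 * while a migration is active aborts it via migration_cancel(); on a
 * destination that has only been advised of postcopy, grown ranges are
 * discarded again and postcopy_length is updated, while once the guest runs
 * (or postcopy never started) the resize needs no extra work here.
 */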
4341static RAMBlockNotifier ram_mig_ram_notifier = {
4342 .ram_block_resized = ram_mig_ram_block_resized,
4343};
4344
4345void ram_mig_init(void)
4346{
4347 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4348 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4349 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4350}