migration/ram.c (git blame view, mirror_qemu.git) at commit "ram.c: Remove last ram.c dependency from the core compress code"
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
b85ea5fa 33#include "qemu/madvise.h"
7205c9ec 34#include "qemu/main-loop.h"
c0e0825c 35#include "io/channel-null.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
947701cc 39#include "migration-stats.h"
f2a8f0a6 40#include "migration/register.h"
7b1e1a22 41#include "migration/misc.h"
08a0aee1 42#include "qemu-file.h"
be07b0ac 43#include "postcopy-ram.h"
53d37d36 44#include "page_cache.h"
56e93d26 45#include "qemu/error-report.h"
e688df6b 46#include "qapi/error.h"
ab7cbb0b 47#include "qapi/qapi-types-migration.h"
9af23989 48#include "qapi/qapi-events-migration.h"
8acabf69 49#include "qapi/qmp/qerror.h"
56e93d26 50#include "trace.h"
56e93d26 51#include "exec/ram_addr.h"
f9494614 52#include "exec/target_page.h"
56e93d26 53#include "qemu/rcu_queue.h"
a91246c9 54#include "migration/colo.h"
53d37d36 55#include "block.h"
b0c3cf94 56#include "sysemu/cpu-throttle.h"
edd090c7 57#include "savevm.h"
b9ee2f7d 58#include "qemu/iov.h"
d32ca5ad 59#include "multifd.h"
278e2f55 60#include "sysemu/runstate.h"
1f0776f1 61#include "options.h"
278e2f55 62
e5fdf920
LS
63#include "hw/boards.h" /* for machine_dump_guest_core() */
64
278e2f55
AG
65#if defined(__linux__)
66#include "qemu/userfaultfd.h"
67#endif /* defined(__linux__) */
56e93d26 68
56e93d26
JQ
69/***********************************************************/
70/* ram save/restore */
71
7b548761
JQ
72/*
73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
74 * worked for pages that were filled with the same char. We switched
bb890ed5 75 * it to only search for the zero value, and it was renamed to avoid
7b548761 76 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
bb890ed5 77 */
7b548761
JQ
78/*
79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
80 */
81#define RAM_SAVE_FLAG_FULL 0x01
bb890ed5 82#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
83#define RAM_SAVE_FLAG_MEM_SIZE 0x04
84#define RAM_SAVE_FLAG_PAGE 0x08
85#define RAM_SAVE_FLAG_EOS 0x10
86#define RAM_SAVE_FLAG_CONTINUE 0x20
87#define RAM_SAVE_FLAG_XBZRLE 0x40
7b548761 88/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
56e93d26 89#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
294e5a40 90#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
7b548761 91/* We can't use any flag that is bigger than 0x200 */
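/*
 * Editor's note (added, not in the original source): these flag values are
 * OR'ed into the low bits of the page-aligned offset that save_page_header()
 * writes on the wire.  A receiver can split them back out with something
 * like the following, where "value" is an illustrative name for the 64-bit
 * field read from the stream:
 *
 *     int flags = value & ~TARGET_PAGE_MASK;
 *     ram_addr_t addr = value & TARGET_PAGE_MASK;
 */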
56e93d26 92
04ffce13 93int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
94 uint8_t *, int) = xbzrle_encode_buffer;
95#if defined(CONFIG_AVX512BW_OPT)
96#include "qemu/cpuid.h"
97static void __attribute__((constructor)) init_cpu_flag(void)
98{
99 unsigned max = __get_cpuid_max(0, NULL);
100 int a, b, c, d;
101 if (max >= 1) {
102 __cpuid(1, a, b, c, d);
103 /* We must check that AVX is not just available, but usable. */
104 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
105 int bv;
106 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
107 __cpuid_count(7, 0, a, b, c, d);
108 /* 0xe6:
109 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
110 * and ZMM16-ZMM31 state are enabled by OS)
111 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
112 */
113 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
114 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
115 }
116 }
117 }
118}
119#endif
120
9360447d
JQ
121XBZRLECacheStats xbzrle_counters;
122
f1668764
PX
123/* used by the search for pages to send */
124struct PageSearchStatus {
125 /* The migration channel used for a specific host page */
126 QEMUFile *pss_channel;
ec6f3ab9
PX
127 /* Last block from where we have sent data */
128 RAMBlock *last_sent_block;
f1668764
PX
129 /* Current block being searched */
130 RAMBlock *block;
131 /* Current page to search from */
132 unsigned long page;
133 /* Set once we wrap around */
134 bool complete_round;
f1668764
PX
135 /* Whether we're sending a host page */
136 bool host_page_sending;
137 /* The start/end of current host page. Invalid if host_page_sending==false */
138 unsigned long host_page_start;
139 unsigned long host_page_end;
140};
141typedef struct PageSearchStatus PageSearchStatus;
142
56e93d26
JQ
143/* This struct contains the XBZRLE cache and a static page
144 used by the compression */
145static struct {
146 /* buffer used for XBZRLE encoding */
147 uint8_t *encoded_buf;
148 /* buffer for storing page content */
149 uint8_t *current_buf;
150 /* Cache for XBZRLE, Protected by lock. */
151 PageCache *cache;
152 QemuMutex lock;
c00e0928
JQ
153 /* it will store a page full of zeros */
154 uint8_t *zero_target_page;
f265e0e4
JQ
155 /* buffer used for XBZRLE decoding */
156 uint8_t *decoded_buf;
56e93d26
JQ
157} XBZRLE;
158
56e93d26
JQ
159static void XBZRLE_cache_lock(void)
160{
87dca0c9 161 if (migrate_xbzrle()) {
56e93d26 162 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 163 }
56e93d26
JQ
164}
165
166static void XBZRLE_cache_unlock(void)
167{
87dca0c9 168 if (migrate_xbzrle()) {
56e93d26 169 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 170 }
56e93d26
JQ
171}
172
3d0684b2
JQ
173/**
174 * xbzrle_cache_resize: resize the xbzrle cache
175 *
cbde7be9 176 * This function is called from migrate_params_apply in main
3d0684b2
JQ
177 * thread, possibly while a migration is in progress. A running
178 * migration may be using the cache and might finish during this call,
179 * hence changes to the cache are protected by XBZRLE.lock().
180 *
c9dede2d 181 * Returns 0 for success or -1 for error
3d0684b2
JQ
182 *
183 * @new_size: new cache size
8acabf69 184 * @errp: set *errp with the reason if the check failed
56e93d26 185 */
8b9407a0 186int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
187{
188 PageCache *new_cache;
c9dede2d 189 int64_t ret = 0;
56e93d26 190
8acabf69
JQ
191 /* Check for truncation */
192 if (new_size != (size_t)new_size) {
193 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
194 "exceeding address space");
195 return -1;
196 }
197
2a313e5c
JQ
198 if (new_size == migrate_xbzrle_cache_size()) {
199 /* nothing to do */
c9dede2d 200 return 0;
2a313e5c
JQ
201 }
202
56e93d26
JQ
203 XBZRLE_cache_lock();
204
205 if (XBZRLE.cache != NULL) {
80f8dfde 206 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 207 if (!new_cache) {
56e93d26
JQ
208 ret = -1;
209 goto out;
210 }
211
212 cache_fini(XBZRLE.cache);
213 XBZRLE.cache = new_cache;
214 }
56e93d26
JQ
215out:
216 XBZRLE_cache_unlock();
217 return ret;
218}
219
20123ee1
PX
220static bool postcopy_preempt_active(void)
221{
222 return migrate_postcopy_preempt() && migration_in_postcopy();
223}
224
3ded54b1 225bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
226{
227 return !qemu_ram_is_migratable(block) ||
228 (migrate_ignore_shared() && qemu_ram_is_shared(block));
229}
230
343f632c
DDAG
231#undef RAMBLOCK_FOREACH
232
fbd162e6
YK
233int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
234{
235 RAMBlock *block;
236 int ret = 0;
237
89ac5a1d
DDAG
238 RCU_READ_LOCK_GUARD();
239
fbd162e6
YK
240 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
241 ret = func(block, opaque);
242 if (ret) {
243 break;
244 }
245 }
fbd162e6
YK
246 return ret;
247}
248
f9494614
AP
249static void ramblock_recv_map_init(void)
250{
251 RAMBlock *rb;
252
fbd162e6 253 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
254 assert(!rb->receivedmap);
255 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
256 }
257}
258
259int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
260{
261 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
262 rb->receivedmap);
263}
264
1cba9f6e
DDAG
265bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
266{
267 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
268}
269
f9494614
AP
270void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
271{
272 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
273}
274
275void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
276 size_t nr)
277{
278 bitmap_set_atomic(rb->receivedmap,
279 ramblock_recv_bitmap_offset(host_addr, rb),
280 nr);
281}
282
a335debb
PX
283#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
284
285/*
286 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
287 *
288 * Returns >0 if success with sent bytes, or <0 if error.
289 */
290int64_t ramblock_recv_bitmap_send(QEMUFile *file,
291 const char *block_name)
292{
293 RAMBlock *block = qemu_ram_block_by_name(block_name);
294 unsigned long *le_bitmap, nbits;
295 uint64_t size;
296
297 if (!block) {
298 error_report("%s: invalid block name: %s", __func__, block_name);
299 return -1;
300 }
301
898ba906 302 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
303
304 /*
305 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
306 * machines we may need 4 more bytes for padding (see below
307 * comment). So extend it a bit beforehand.
308 */
309 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
310
311 /*
312 * Always use little endian when sending the bitmap. This is
313 * required because the source and destination VMs may not use the
3a4452d8 314 * same endianness. (Note: big endian won't work.)
a335debb
PX
315 */
316 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
317
318 /* Size of the bitmap, in bytes */
a725ef9f 319 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
320
321 /*
322 * size is always aligned to 8 bytes for 64bit machines, but it
323 * may not be true for 32bit machines. We need this padding to
324 * make sure the migration can survive even between 32bit and
325 * 64bit machines.
326 */
327 size = ROUND_UP(size, 8);
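    /*
     * Worked example (added by the editor, not in the original): with
     * nbits = 100 bitmap bits, DIV_ROUND_UP(100, 8) = 13 bytes, which
     * ROUND_UP(13, 8) then pads to 16 bytes on the wire.
     */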
328
329 qemu_put_be64(file, size);
330 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
331 /*
332 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 333 * some "mysterious" reason.
a335debb
PX
334 */
335 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
336 qemu_fflush(file);
337
bf269906 338 g_free(le_bitmap);
a335debb
PX
339
340 if (qemu_file_get_error(file)) {
341 return qemu_file_get_error(file);
342 }
343
344 return size + sizeof(size);
345}
346
ec481c6c
JQ
347/*
348 * An outstanding page request, on the source, having been received
349 * and queued
350 */
351struct RAMSrcPageRequest {
352 RAMBlock *rb;
353 hwaddr offset;
354 hwaddr len;
355
356 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
357};
358
6f37bb8b
JQ
359/* State of RAM for migration */
360struct RAMState {
f1668764
PX
361 /*
362 * PageSearchStatus structures for the channels when send pages.
363 * Protected by the bitmap_mutex.
364 */
365 PageSearchStatus pss[RAM_CHANNEL_MAX];
278e2f55
AG
366 /* UFFD file descriptor, used in 'write-tracking' migration */
367 int uffdio_fd;
8d80e195
JQ
368 /* total ram size in bytes */
369 uint64_t ram_bytes_total;
6f37bb8b
JQ
370 /* Last block that we have visited searching for dirty pages */
371 RAMBlock *last_seen_block;
269ace29
JQ
372 /* Last dirty target page we have sent */
373 ram_addr_t last_page;
6f37bb8b
JQ
374 /* last ram version we have seen */
375 uint32_t last_version;
8d820d6f
JQ
376 /* How many times we have dirtied too many pages */
377 int dirty_rate_high_cnt;
f664da80
JQ
378 /* these variables are used for bitmap sync */
379 /* last time we did a full bitmap_sync */
380 int64_t time_last_bitmap_sync;
eac74159 381 /* bytes transferred at start_time */
c4bdf0cf 382 uint64_t bytes_xfer_prev;
a66cd90c 383 /* number of dirty pages since start_time */
68908ed6 384 uint64_t num_dirty_pages_period;
b5833fde
JQ
385 /* xbzrle misses since the beginning of the period */
386 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
387 /* Amount of xbzrle pages since the beginning of the period */
388 uint64_t xbzrle_pages_prev;
389 /* Amount of xbzrle encoded bytes since the beginning of the period */
390 uint64_t xbzrle_bytes_prev;
f3095cc8
JQ
391 /* Are we really using XBZRLE (e.g., after the first round). */
392 bool xbzrle_started;
05931ec5
JQ
393 /* Are we on the last stage of migration */
394 bool last_stage;
76e03000
XG
395 /* compression statistics since the beginning of the period */
396 /* number of times there was no free thread to compress data */
397 uint64_t compress_thread_busy_prev;
398 /* amount of bytes after compression */
399 uint64_t compressed_size_prev;
400 /* amount of compressed pages */
401 uint64_t compress_pages_prev;
402
be8b02ed
XG
403 /* total handled target pages at the beginning of period */
404 uint64_t target_page_count_prev;
405 /* total handled target pages since start */
406 uint64_t target_page_count;
9360447d 407 /* number of dirty bits in the bitmap */
2dfaf12e 408 uint64_t migration_dirty_pages;
f1668764
PX
409 /*
410 * Protects:
411 * - dirty/clear bitmap
412 * - migration_dirty_pages
413 * - pss structures
414 */
108cfae0 415 QemuMutex bitmap_mutex;
68a098f3
JQ
416 /* The RAMBlock used in the last src_page_requests */
417 RAMBlock *last_req_rb;
ec481c6c
JQ
418 /* Queue of outstanding page requests from the destination */
419 QemuMutex src_page_req_mutex;
b58deb34 420 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
421};
422typedef struct RAMState RAMState;
423
53518d94 424static RAMState *ram_state;
6f37bb8b 425
bd227060
WW
426static NotifierWithReturnList precopy_notifier_list;
427
a1fe28df
PX
428/* Whether postcopy has queued requests? */
429static bool postcopy_has_request(RAMState *rs)
430{
431 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
432}
433
bd227060
WW
434void precopy_infrastructure_init(void)
435{
436 notifier_with_return_list_init(&precopy_notifier_list);
437}
438
439void precopy_add_notifier(NotifierWithReturn *n)
440{
441 notifier_with_return_list_add(&precopy_notifier_list, n);
442}
443
444void precopy_remove_notifier(NotifierWithReturn *n)
445{
446 notifier_with_return_remove(n);
447}
448
449int precopy_notify(PrecopyNotifyReason reason, Error **errp)
450{
451 PrecopyNotifyData pnd;
452 pnd.reason = reason;
453 pnd.errp = errp;
454
455 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
456}
457
9edabd4d 458uint64_t ram_bytes_remaining(void)
2f4fde93 459{
bae416e5
DDAG
460 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
461 0;
2f4fde93
JQ
462}
463
26a26069 464void ram_transferred_add(uint64_t bytes)
4c2d0f6d 465{
ae680668 466 if (runstate_is_running()) {
aff3f660 467 stat64_add(&mig_stats.precopy_bytes, bytes);
ae680668 468 } else if (migration_in_postcopy()) {
aff3f660 469 stat64_add(&mig_stats.postcopy_bytes, bytes);
ae680668 470 } else {
aff3f660 471 stat64_add(&mig_stats.downtime_bytes, bytes);
ae680668 472 }
aff3f660 473 stat64_add(&mig_stats.transferred, bytes);
4c2d0f6d
DE
474}
475
4010ba38
JQ
476struct MigrationOps {
477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478};
479typedef struct MigrationOps MigrationOps;
480
481MigrationOps *migration_ops;
482
76e03000
XG
483CompressionStats compression_counters;
484
97274a87
LS
485enum CompressResult {
486 RES_NONE = 0,
487 RES_ZEROPAGE = 1,
488 RES_COMPRESS = 2
489};
490typedef enum CompressResult CompressResult;
491
56e93d26 492struct CompressParam {
56e93d26 493 bool done;
90e56fb4 494 bool quit;
10c2f7b7 495 bool trigger;
97274a87 496 CompressResult result;
56e93d26
JQ
497 QEMUFile *file;
498 QemuMutex mutex;
499 QemuCond cond;
500 RAMBlock *block;
501 ram_addr_t offset;
34ab9e97
XG
502
503 /* internally used fields */
dcaf446e 504 z_stream stream;
34ab9e97 505 uint8_t *originbuf;
56e93d26
JQ
506};
507typedef struct CompressParam CompressParam;
508
509struct DecompressParam {
73a8912b 510 bool done;
90e56fb4 511 bool quit;
56e93d26
JQ
512 QemuMutex mutex;
513 QemuCond cond;
514 void *des;
d341d9f3 515 uint8_t *compbuf;
56e93d26 516 int len;
797ca154 517 z_stream stream;
56e93d26
JQ
518};
519typedef struct DecompressParam DecompressParam;
520
521static CompressParam *comp_param;
522static QemuThread *compress_threads;
523/* comp_done_cond is used to wake up the migration thread when
524 * one of the compression threads has finished the compression.
525 * comp_done_lock is used to co-work with comp_done_cond.
526 */
0d9f9a5c
LL
527static QemuMutex comp_done_lock;
528static QemuCond comp_done_cond;
56e93d26 529
34ab9e97 530static QEMUFile *decomp_file;
56e93d26
JQ
531static DecompressParam *decomp_param;
532static QemuThread *decompress_threads;
73a8912b
LL
533static QemuMutex decomp_done_lock;
534static QemuCond decomp_done_cond;
56e93d26 535
93589827
PX
536static int ram_save_host_page_urgent(PageSearchStatus *pss);
537
97274a87
LS
538static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream,
539 RAMBlock *block, ram_addr_t offset,
540 uint8_t *source_buf);
56e93d26 541
ebd88a49
PX
542/* NOTE: page is the PFN not real ram_addr_t. */
543static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
544{
545 pss->block = rb;
546 pss->page = page;
547 pss->complete_round = false;
548}
549
93589827
PX
550/*
551 * Check whether two PSSs are actively sending the same page. Return true
552 * if it is, false otherwise.
553 */
554static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
555{
556 return pss1->host_page_sending && pss2->host_page_sending &&
557 (pss1->host_page_start == pss2->host_page_start);
558}
559
56e93d26
JQ
560static void *do_data_compress(void *opaque)
561{
562 CompressParam *param = opaque;
a7a9a88f
LL
563 RAMBlock *block;
564 ram_addr_t offset;
97274a87 565 CompressResult result;
56e93d26 566
a7a9a88f 567 qemu_mutex_lock(&param->mutex);
90e56fb4 568 while (!param->quit) {
10c2f7b7 569 if (param->trigger) {
a7a9a88f
LL
570 block = param->block;
571 offset = param->offset;
10c2f7b7 572 param->trigger = false;
a7a9a88f
LL
573 qemu_mutex_unlock(&param->mutex);
574
97274a87
LS
575 result = do_compress_ram_page(param->file, &param->stream,
576 block, offset, param->originbuf);
a7a9a88f 577
0d9f9a5c 578 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 579 param->done = true;
97274a87 580 param->result = result;
0d9f9a5c
LL
581 qemu_cond_signal(&comp_done_cond);
582 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
583
584 qemu_mutex_lock(&param->mutex);
585 } else {
56e93d26
JQ
586 qemu_cond_wait(&param->cond, &param->mutex);
587 }
56e93d26 588 }
a7a9a88f 589 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
590
591 return NULL;
592}
593
f0afa331 594static void compress_threads_save_cleanup(void)
56e93d26
JQ
595{
596 int i, thread_count;
597
a7a94d14 598 if (!migrate_compress() || !comp_param) {
56e93d26
JQ
599 return;
600 }
05306935 601
56e93d26
JQ
602 thread_count = migrate_compress_threads();
603 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
604 /*
605 * we use it as an indicator which shows if the thread is
606 * properly init'd or not
607 */
608 if (!comp_param[i].file) {
609 break;
610 }
05306935
FL
611
612 qemu_mutex_lock(&comp_param[i].mutex);
613 comp_param[i].quit = true;
614 qemu_cond_signal(&comp_param[i].cond);
615 qemu_mutex_unlock(&comp_param[i].mutex);
616
56e93d26 617 qemu_thread_join(compress_threads + i);
56e93d26
JQ
618 qemu_mutex_destroy(&comp_param[i].mutex);
619 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 620 deflateEnd(&comp_param[i].stream);
34ab9e97 621 g_free(comp_param[i].originbuf);
dcaf446e
XG
622 qemu_fclose(comp_param[i].file);
623 comp_param[i].file = NULL;
56e93d26 624 }
0d9f9a5c
LL
625 qemu_mutex_destroy(&comp_done_lock);
626 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
627 g_free(compress_threads);
628 g_free(comp_param);
56e93d26
JQ
629 compress_threads = NULL;
630 comp_param = NULL;
56e93d26
JQ
631}
632
dcaf446e 633static int compress_threads_save_setup(void)
56e93d26
JQ
634{
635 int i, thread_count;
636
a7a94d14 637 if (!migrate_compress()) {
dcaf446e 638 return 0;
56e93d26 639 }
56e93d26
JQ
640 thread_count = migrate_compress_threads();
641 compress_threads = g_new0(QemuThread, thread_count);
642 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
643 qemu_cond_init(&comp_done_cond);
644 qemu_mutex_init(&comp_done_lock);
56e93d26 645 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
646 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
647 if (!comp_param[i].originbuf) {
648 goto exit;
649 }
650
dcaf446e
XG
651 if (deflateInit(&comp_param[i].stream,
652 migrate_compress_level()) != Z_OK) {
34ab9e97 653 g_free(comp_param[i].originbuf);
dcaf446e
XG
654 goto exit;
655 }
656
e110aa91
C
657 /* comp_param[i].file is just used as a dummy buffer to save data,
658 * set its ops to empty.
56e93d26 659 */
77ef2dc1 660 comp_param[i].file = qemu_file_new_output(
c0e0825c 661 QIO_CHANNEL(qio_channel_null_new()));
56e93d26 662 comp_param[i].done = true;
90e56fb4 663 comp_param[i].quit = false;
56e93d26
JQ
664 qemu_mutex_init(&comp_param[i].mutex);
665 qemu_cond_init(&comp_param[i].cond);
666 qemu_thread_create(compress_threads + i, "compress",
667 do_data_compress, comp_param + i,
668 QEMU_THREAD_JOINABLE);
669 }
dcaf446e
XG
670 return 0;
671
672exit:
673 compress_threads_save_cleanup();
674 return -1;
56e93d26
JQ
675}
676
677/**
3d0684b2 678 * save_page_header: write page header to wire
56e93d26
JQ
679 *
680 * If this is the 1st block, it also writes the block identification
681 *
3d0684b2 682 * Returns the number of bytes written
56e93d26 683 *
ec6f3ab9 684 * @pss: current PSS channel status
56e93d26
JQ
685 * @block: block that contains the page we want to send
686 * @offset: offset inside the block for the page
687 * in the lower bits, it contains flags
688 */
37502df3
LS
689static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
690 RAMBlock *block, ram_addr_t offset)
56e93d26 691{
9f5f380b 692 size_t size, len;
ec6f3ab9 693 bool same_block = (block == pss->last_sent_block);
56e93d26 694
10661f11 695 if (same_block) {
24795694
JQ
696 offset |= RAM_SAVE_FLAG_CONTINUE;
697 }
2bf3aa85 698 qemu_put_be64(f, offset);
56e93d26
JQ
699 size = 8;
700
10661f11 701 if (!same_block) {
9f5f380b 702 len = strlen(block->idstr);
2bf3aa85
JQ
703 qemu_put_byte(f, len);
704 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 705 size += 1 + len;
ec6f3ab9 706 pss->last_sent_block = block;
56e93d26
JQ
707 }
708 return size;
709}
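/*
 * Editor's note (added): the resulting wire format is an 8-byte big-endian
 * (offset | flags) field, followed by a 1-byte idstr length and the idstr
 * itself only when the block differs from the last one sent (i.e. when
 * RAM_SAVE_FLAG_CONTINUE is not set).
 */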
710
3d0684b2 711/**
179a8080 712 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
713 *
714 * Reduce amount of guest cpu execution to hopefully slow down memory
715 * writes. If guest dirty memory rate is reduced below the rate at
716 * which we can transfer pages to the destination then we should be
717 * able to complete migration. Some workloads dirty memory way too
718 * fast and will not effectively converge, even with auto-converge.
070afca2 719 */
cbbf8182
KZ
720static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
721 uint64_t bytes_dirty_threshold)
070afca2 722{
2a8ec380 723 uint64_t pct_initial = migrate_cpu_throttle_initial();
9605c2ac 724 uint64_t pct_increment = migrate_cpu_throttle_increment();
873f674c 725 bool pct_tailslow = migrate_cpu_throttle_tailslow();
24155bd0 726 int pct_max = migrate_max_cpu_throttle();
070afca2 727
cbbf8182
KZ
728 uint64_t throttle_now = cpu_throttle_get_percentage();
729 uint64_t cpu_now, cpu_ideal, throttle_inc;
730
070afca2
JH
731 /* We have not started throttling yet. Let's start it. */
732 if (!cpu_throttle_active()) {
733 cpu_throttle_set(pct_initial);
734 } else {
735 /* Throttling already on, just increase the rate */
cbbf8182
KZ
736 if (!pct_tailslow) {
737 throttle_inc = pct_increment;
738 } else {
739 /* Compute the ideal CPU percentage used by Guest, which may
740 * make the dirty rate match the dirty rate threshold. */
741 cpu_now = 100 - throttle_now;
742 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
743 bytes_dirty_period);
744 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
745 }
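        /*
         * Worked example (added by the editor, not part of the original
         * code): with the throttle currently at 20%, cpu_now = 80.  If the
         * guest dirtied twice the threshold (bytes_dirty_threshold /
         * bytes_dirty_period = 0.5), cpu_ideal = 80 * 0.5 = 40, so
         * throttle_inc = MIN(80 - 40, pct_increment); with a pct_increment
         * of 10 the throttle is raised to 30% (capped at pct_max).
         */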
746 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
747 }
748}
749
91fe9a8d
RL
750void mig_throttle_counter_reset(void)
751{
752 RAMState *rs = ram_state;
753
754 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
755 rs->num_dirty_pages_period = 0;
aff3f660 756 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
91fe9a8d
RL
757}
758
3d0684b2
JQ
759/**
760 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
761 *
6f37bb8b 762 * @rs: current RAM state
3d0684b2
JQ
763 * @current_addr: address for the zero page
764 *
765 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
766 * The important thing is that a stale (not-yet-0'd) page be replaced
767 * by the new data.
768 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 769 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 770 */
6f37bb8b 771static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 772{
56e93d26
JQ
773 /* We don't care if this fails to allocate a new cache page
774 * as long as it updated an old one */
c00e0928 775 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
aff3f660 776 stat64_get(&mig_stats.dirty_sync_count));
56e93d26
JQ
777}
778
779#define ENCODING_FLAG_XBZRLE 0x1
780
781/**
782 * save_xbzrle_page: compress and send current page
783 *
784 * Returns: 1 means that we wrote the page
785 * 0 means that page is identical to the one already sent
786 * -1 means that xbzrle would be longer than normal
787 *
5a987738 788 * @rs: current RAM state
ec6f3ab9 789 * @pss: current PSS channel
3d0684b2
JQ
790 * @current_data: pointer to the address of the page contents
791 * @current_addr: addr of the page
56e93d26
JQ
792 * @block: block that contains the page we want to send
793 * @offset: offset inside the block for the page
56e93d26 794 */
ec6f3ab9 795static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
61717ea9
PX
796 uint8_t **current_data, ram_addr_t current_addr,
797 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
798{
799 int encoded_len = 0, bytes_xbzrle;
800 uint8_t *prev_cached_page;
ec6f3ab9 801 QEMUFile *file = pss->pss_channel;
aff3f660 802 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
56e93d26 803
536b5a4e 804 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
9360447d 805 xbzrle_counters.cache_miss++;
05931ec5 806 if (!rs->last_stage) {
56e93d26 807 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
536b5a4e 808 generation) == -1) {
56e93d26
JQ
809 return -1;
810 } else {
811 /* update *current_data when the page has been
812 inserted into cache */
813 *current_data = get_cached_data(XBZRLE.cache, current_addr);
814 }
815 }
816 return -1;
817 }
818
e460a4b1
WW
819 /*
820 * Reaching here means the page has hit the xbzrle cache, no matter what
821 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 822 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
823 *
824 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
825 * 2nd page turns out to be skipped (i.e. no new bytes written to the
826 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
827 * skipped page included. In this way, the encoding rate can tell if the
828 * guest page is good for xbzrle encoding.
829 */
830 xbzrle_counters.pages++;
56e93d26
JQ
831 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
832
833 /* save current buffer into memory */
834 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
835
836 /* XBZRLE encoding (if there is no overflow) */
04ffce13 837 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
838 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
839 TARGET_PAGE_SIZE);
ca353803
WY
840
841 /*
842 * Update the cache contents, so that it corresponds to the data
843 * sent, in all cases except where we skip the page.
844 */
05931ec5 845 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
846 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
847 /*
848 * In the case where we couldn't compress, ensure that the caller
849 * sends the data from the cache, since the guest might have
850 * changed the RAM since we copied it.
851 */
852 *current_data = prev_cached_page;
853 }
854
56e93d26 855 if (encoded_len == 0) {
55c4446b 856 trace_save_xbzrle_page_skipping();
56e93d26
JQ
857 return 0;
858 } else if (encoded_len == -1) {
55c4446b 859 trace_save_xbzrle_page_overflow();
9360447d 860 xbzrle_counters.overflow++;
e460a4b1 861 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
862 return -1;
863 }
864
56e93d26 865 /* Send XBZRLE based compressed page */
37502df3 866 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
204b88b8 867 offset | RAM_SAVE_FLAG_XBZRLE);
61717ea9
PX
868 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
869 qemu_put_be16(file, encoded_len);
870 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
56e93d26 871 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
872 /*
873 * Like compressed_size (please see update_compress_thread_counts),
874 * the xbzrle encoded bytes don't count the 8 byte header with
875 * RAM_SAVE_FLAG_CONTINUE.
876 */
877 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 878 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
879
880 return 1;
881}
882
3d0684b2 883/**
d9e474ea 884 * pss_find_next_dirty: find the next dirty page of current ramblock
f3f491fc 885 *
d9e474ea
PX
886 * This function updates pss->page to point to the next dirty page index
887 * within the ramblock to migrate, or the end of ramblock when nothing
888 * found. Note that when pss->host_page_sending==true it means we're
889 * during sending a host page, so we won't look for dirty page that is
890 * outside the host page boundary.
3d0684b2 891 *
d9e474ea 892 * @pss: the current page search status
f3f491fc 893 */
d9e474ea 894static void pss_find_next_dirty(PageSearchStatus *pss)
56e93d26 895{
d9e474ea 896 RAMBlock *rb = pss->block;
6b6712ef
JQ
897 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
898 unsigned long *bitmap = rb->bmap;
56e93d26 899
fbd162e6 900 if (ramblock_is_ignored(rb)) {
d9e474ea
PX
901 /* Points directly to the end, so we know no dirty page */
902 pss->page = size;
903 return;
904 }
905
906 /*
907 * If during sending a host page, only look for dirty pages within the
908 * current host page being send.
909 */
910 if (pss->host_page_sending) {
911 assert(pss->host_page_end);
912 size = MIN(size, pss->host_page_end);
b895de50
CLG
913 }
914
d9e474ea 915 pss->page = find_next_bit(bitmap, size, pss->page);
56e93d26
JQ
916}
917
1230a25f 918static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
919 unsigned long page)
920{
921 uint8_t shift;
922 hwaddr size, start;
923
924 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
925 return;
926 }
927
928 shift = rb->clear_bmap_shift;
929 /*
930 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
931 * can make things easier sometimes since then start address
932 * of the small chunk will always be 64 pages aligned so the
933 * bitmap will always be aligned to unsigned long. We should
934 * even be able to remove this restriction but I'm simply
935 * keeping it.
936 */
937 assert(shift >= 6);
938
939 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 940 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
941 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
942 memory_region_clear_dirty_bitmap(rb->mr, start, size);
943}
944
945static void
1230a25f 946migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
947 unsigned long start,
948 unsigned long npages)
949{
950 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
951 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
952 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
953
954 /*
955 * Clear pages from start to start + npages - 1, so the end boundary is
956 * exclusive.
957 */
958 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 959 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
960 }
961}
962
a6a83cef
RL
963/*
964 * colo_bitmap_find_dirty: find contiguous dirty pages from start
965 *
966 * Returns the page offset within the memory region of the start of the contiguous
967 * dirty page
968 *
969 * @rs: current RAM state
970 * @rb: RAMBlock where to search for dirty pages
971 * @start: page where we start the search
972 * @num: the number of contiguous dirty pages
973 */
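/*
 * Illustrative example (added by the editor): if bits 1..3 of the bitmap are
 * dirty and bit 4 is clean, a call with start = 0 returns 1 and sets
 * *num = 3 (find_next_bit() finds bit 1, find_next_zero_bit() finds bit 4).
 */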
974static inline
975unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
976 unsigned long start, unsigned long *num)
977{
978 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
979 unsigned long *bitmap = rb->bmap;
980 unsigned long first, next;
981
982 *num = 0;
983
984 if (ramblock_is_ignored(rb)) {
985 return size;
986 }
987
988 first = find_next_bit(bitmap, size, start);
989 if (first >= size) {
990 return first;
991 }
992 next = find_next_zero_bit(bitmap, size, first + 1);
993 assert(next >= first);
994 *num = next - first;
995 return first;
996}
997
06b10688 998static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
999 RAMBlock *rb,
1000 unsigned long page)
a82d593b
DDAG
1001{
1002 bool ret;
a82d593b 1003
002cad6b
PX
1004 /*
1005 * Clear dirty bitmap if needed. This _must_ be called before we
1006 * send any of the page in the chunk because we need to make sure
1007 * we can capture further page content changes when we sync dirty
1008 * log the next time. So as long as we are going to send any of
1009 * the page in the chunk we clear the remote dirty bitmap for all.
1010 * Clearing it earlier won't be a problem, but too late will.
1011 */
1230a25f 1012 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 1013
6b6712ef 1014 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 1015 if (ret) {
0d8ec885 1016 rs->migration_dirty_pages--;
a82d593b 1017 }
386a907b 1018
a82d593b
DDAG
1019 return ret;
1020}
1021
be39b4cd
DH
1022static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1023 void *opaque)
1024{
1025 const hwaddr offset = section->offset_within_region;
1026 const hwaddr size = int128_get64(section->size);
1027 const unsigned long start = offset >> TARGET_PAGE_BITS;
1028 const unsigned long npages = size >> TARGET_PAGE_BITS;
1029 RAMBlock *rb = section->mr->ram_block;
1030 uint64_t *cleared_bits = opaque;
1031
1032 /*
1033 * We don't grab ram_state->bitmap_mutex because we expect to run
1034 * only when starting migration or during postcopy recovery where
1035 * we don't have concurrent access.
1036 */
1037 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1038 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1039 }
1040 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1041 bitmap_clear(rb->bmap, start, npages);
1042}
1043
1044/*
1045 * Exclude all dirty pages from migration that fall into a discarded range as
1046 * managed by a RamDiscardManager responsible for the mapped memory region of
1047 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1048 *
1049 * Discarded pages ("logically unplugged") have undefined content and must
1050 * not get migrated, because even reading these pages for migration might
1051 * result in undesired behavior.
1052 *
1053 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1054 *
1055 * Note: The result is only stable while migrating (precopy/postcopy).
1056 */
1057static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1058{
1059 uint64_t cleared_bits = 0;
1060
1061 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1062 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1063 MemoryRegionSection section = {
1064 .mr = rb->mr,
1065 .offset_within_region = 0,
1066 .size = int128_make64(qemu_ram_get_used_length(rb)),
1067 };
1068
1069 ram_discard_manager_replay_discarded(rdm, &section,
1070 dirty_bitmap_clear_section,
1071 &cleared_bits);
1072 }
1073 return cleared_bits;
1074}
1075
9470c5e0
DH
1076/*
1077 * Check if a host-page aligned page falls into a discarded range as managed by
1078 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1079 *
1080 * Note: The result is only stable while migrating (precopy/postcopy).
1081 */
1082bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1083{
1084 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1085 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1086 MemoryRegionSection section = {
1087 .mr = rb->mr,
1088 .offset_within_region = start,
1089 .size = int128_make64(qemu_ram_pagesize(rb)),
1090 };
1091
1092 return !ram_discard_manager_is_populated(rdm, &section);
1093 }
1094 return false;
1095}
1096
267691b6 1097/* Called with RCU critical section */
7a3e9571 1098static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1099{
fb613580
KZ
1100 uint64_t new_dirty_pages =
1101 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1102
1103 rs->migration_dirty_pages += new_dirty_pages;
1104 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1105}
1106
3d0684b2
JQ
1107/**
1108 * ram_pagesize_summary: calculate all the pagesizes of a VM
1109 *
1110 * Returns a summary bitmap of the page sizes of all RAMBlocks
1111 *
1112 * For VMs with just normal pages this is equivalent to the host page
1113 * size. If it's got some huge pages then it's the OR of all the
1114 * different page sizes.
e8ca1db2
DDAG
1115 */
1116uint64_t ram_pagesize_summary(void)
1117{
1118 RAMBlock *block;
1119 uint64_t summary = 0;
1120
fbd162e6 1121 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1122 summary |= block->page_size;
1123 }
1124
1125 return summary;
1126}
1127
aecbfe9c
XG
1128uint64_t ram_get_total_transferred_pages(void)
1129{
aff3f660
JQ
1130 return stat64_get(&mig_stats.normal_pages) +
1131 stat64_get(&mig_stats.zero_pages) +
23b7576d 1132 compression_counters.pages + xbzrle_counters.pages;
aecbfe9c
XG
1133}
1134
b734035b
XG
1135static void migration_update_rates(RAMState *rs, int64_t end_time)
1136{
be8b02ed 1137 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1138 double compressed_size;
b734035b
XG
1139
1140 /* calculate period counters */
aff3f660 1141 stat64_set(&mig_stats.dirty_pages_rate,
72f8e587
JQ
1142 rs->num_dirty_pages_period * 1000 /
1143 (end_time - rs->time_last_bitmap_sync));
b734035b 1144
be8b02ed 1145 if (!page_count) {
b734035b
XG
1146 return;
1147 }
1148
87dca0c9 1149 if (migrate_xbzrle()) {
e460a4b1
WW
1150 double encoded_size, unencoded_size;
1151
b734035b 1152 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1153 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1154 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1155 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1156 TARGET_PAGE_SIZE;
1157 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1158 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1159 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1160 } else {
1161 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1162 }
1163 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1164 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1165 }
76e03000 1166
a7a94d14 1167 if (migrate_compress()) {
76e03000
XG
1168 compression_counters.busy_rate = (double)(compression_counters.busy -
1169 rs->compress_thread_busy_prev) / page_count;
1170 rs->compress_thread_busy_prev = compression_counters.busy;
1171
1172 compressed_size = compression_counters.compressed_size -
1173 rs->compressed_size_prev;
1174 if (compressed_size) {
1175 double uncompressed_size = (compression_counters.pages -
1176 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1177
1178 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1179 compression_counters.compression_rate =
1180 uncompressed_size / compressed_size;
1181
1182 rs->compress_pages_prev = compression_counters.pages;
1183 rs->compressed_size_prev = compression_counters.compressed_size;
1184 }
1185 }
b734035b
XG
1186}
1187
dc14a470
KZ
1188static void migration_trigger_throttle(RAMState *rs)
1189{
6499efdb 1190 uint64_t threshold = migrate_throttle_trigger_threshold();
23b7576d 1191 uint64_t bytes_xfer_period =
aff3f660 1192 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
dc14a470
KZ
1193 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1194 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
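    /*
     * Editor's note (added): e.g. with a threshold of 50 (percent),
     * throttling is considered once the bytes dirtied in a sync period
     * exceed half of the bytes transferred in that period, and it kicks in
     * once that has happened twice (dirty_rate_high_cnt below).
     */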
1195
1196 /* During block migration the auto-converge logic incorrectly detects
1197 * that ram migration makes no progress. Avoid this by disabling the
1198 * throttling logic during the bulk phase of block migration. */
1199 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1200 /* The following detection logic can be refined later. For now:
1201 Check to see if the ratio between dirtied bytes and the approx.
1202 amount of bytes that just got transferred since the last time
1203 we were in this routine reaches the threshold. If that happens
1204 twice, start or increase throttling. */
1205
1206 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1207 (++rs->dirty_rate_high_cnt >= 2)) {
1208 trace_migration_throttle();
1209 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1210 mig_throttle_guest_down(bytes_dirty_period,
1211 bytes_dirty_threshold);
dc14a470
KZ
1212 }
1213 }
1214}
1215
8d820d6f 1216static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1217{
1218 RAMBlock *block;
56e93d26 1219 int64_t end_time;
56e93d26 1220
aff3f660 1221 stat64_add(&mig_stats.dirty_sync_count, 1);
56e93d26 1222
f664da80
JQ
1223 if (!rs->time_last_bitmap_sync) {
1224 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1225 }
1226
1227 trace_migration_bitmap_sync_start();
9c1f8f44 1228 memory_global_dirty_log_sync();
56e93d26 1229
108cfae0 1230 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1231 WITH_RCU_READ_LOCK_GUARD() {
1232 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1233 ramblock_sync_dirty_bitmap(rs, block);
1234 }
aff3f660 1235 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
56e93d26 1236 }
108cfae0 1237 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1238
9458a9a1 1239 memory_global_after_dirty_log_sync();
a66cd90c 1240 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1241
56e93d26
JQ
1242 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1243
1244 /* more than 1 second = 1000 milliseconds */
f664da80 1245 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1246 migration_trigger_throttle(rs);
070afca2 1247
b734035b
XG
1248 migration_update_rates(rs, end_time);
1249
be8b02ed 1250 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1251
1252 /* reset period counters */
f664da80 1253 rs->time_last_bitmap_sync = end_time;
a66cd90c 1254 rs->num_dirty_pages_period = 0;
aff3f660 1255 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
56e93d26 1256 }
b890902c 1257 if (migrate_events()) {
aff3f660 1258 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
536b5a4e 1259 qapi_event_send_migration_pass(generation);
4addcd4f 1260 }
56e93d26
JQ
1261}
1262
bd227060
WW
1263static void migration_bitmap_sync_precopy(RAMState *rs)
1264{
1265 Error *local_err = NULL;
1266
1267 /*
1268 * The current notifier usage is just an optimization to migration, so we
1269 * don't stop the normal migration process in the error case.
1270 */
1271 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1272 error_report_err(local_err);
b4a1733c 1273 local_err = NULL;
bd227060
WW
1274 }
1275
1276 migration_bitmap_sync(rs);
1277
1278 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1279 error_report_err(local_err);
1280 }
1281}
1282
a4dbaf8e 1283void ram_release_page(const char *rbname, uint64_t offset)
47fe16ff
JQ
1284{
1285 if (!migrate_release_ram() || !migration_in_postcopy()) {
1286 return;
1287 }
1288
1289 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1290}
1291
6c97ec5f
XG
1292/**
1293 * save_zero_page_to_file: send the zero page to the file
1294 *
1295 * Returns the size of data written to the file, 0 means the page is not
1296 * a zero page
1297 *
ec6f3ab9 1298 * @pss: current PSS channel
6c97ec5f
XG
1299 * @block: block that contains the page we want to send
1300 * @offset: offset inside the block for the page
1301 */
37502df3 1302static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
6c97ec5f
XG
1303 RAMBlock *block, ram_addr_t offset)
1304{
1305 uint8_t *p = block->host + offset;
1306 int len = 0;
1307
bad452a7 1308 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
37502df3 1309 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
6c97ec5f
XG
1310 qemu_put_byte(file, 0);
1311 len += 1;
47fe16ff 1312 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1313 }
1314 return len;
1315}
1316
56e93d26 1317/**
3d0684b2 1318 * save_zero_page: send the zero page to the stream
56e93d26 1319 *
3d0684b2 1320 * Returns the number of pages written.
56e93d26 1321 *
ec6f3ab9 1322 * @pss: current PSS channel
56e93d26
JQ
1323 * @block: block that contains the page we want to send
1324 * @offset: offset inside the block for the page
56e93d26 1325 */
37502df3 1326static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
61717ea9 1327 ram_addr_t offset)
56e93d26 1328{
37502df3 1329 int len = save_zero_page_to_file(pss, f, block, offset);
56e93d26 1330
6c97ec5f 1331 if (len) {
aff3f660 1332 stat64_add(&mig_stats.zero_pages, 1);
4c2d0f6d 1333 ram_transferred_add(len);
6c97ec5f 1334 return 1;
56e93d26 1335 }
6c97ec5f 1336 return -1;
56e93d26
JQ
1337}
1338
059ff0fb
XG
1339/*
1340 * @pages: the number of pages written by the control path,
1341 * < 0 - error
1342 * > 0 - number of pages written
1343 *
1344 * Return true if the page has been saved, otherwise return false.
1345 */
61717ea9
PX
1346static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1347 ram_addr_t offset, int *pages)
059ff0fb
XG
1348{
1349 uint64_t bytes_xmit = 0;
1350 int ret;
1351
1352 *pages = -1;
61717ea9
PX
1353 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1354 TARGET_PAGE_SIZE, &bytes_xmit);
059ff0fb
XG
1355 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1356 return false;
1357 }
1358
1359 if (bytes_xmit) {
4c2d0f6d 1360 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1361 *pages = 1;
1362 }
1363
1364 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1365 return true;
1366 }
1367
1368 if (bytes_xmit > 0) {
aff3f660 1369 stat64_add(&mig_stats.normal_pages, 1);
059ff0fb 1370 } else if (bytes_xmit == 0) {
aff3f660 1371 stat64_add(&mig_stats.zero_pages, 1);
059ff0fb
XG
1372 }
1373
1374 return true;
1375}
1376
65dacaa0
XG
1377/*
1378 * directly send the page to the stream
1379 *
1380 * Returns the number of pages written.
1381 *
ec6f3ab9 1382 * @pss: current PSS channel
65dacaa0
XG
1383 * @block: block that contains the page we want to send
1384 * @offset: offset inside the block for the page
1385 * @buf: the page to be sent
1386 * @async: send the page asynchronously
1387 */
ec6f3ab9 1388static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
61717ea9 1389 ram_addr_t offset, uint8_t *buf, bool async)
65dacaa0 1390{
ec6f3ab9
PX
1391 QEMUFile *file = pss->pss_channel;
1392
37502df3 1393 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
4c2d0f6d 1394 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0 1395 if (async) {
61717ea9 1396 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
f912ec5b 1397 migrate_release_ram() &&
65dacaa0
XG
1398 migration_in_postcopy());
1399 } else {
61717ea9 1400 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
65dacaa0 1401 }
4c2d0f6d 1402 ram_transferred_add(TARGET_PAGE_SIZE);
aff3f660 1403 stat64_add(&mig_stats.normal_pages, 1);
65dacaa0
XG
1404 return 1;
1405}
1406
56e93d26 1407/**
3d0684b2 1408 * ram_save_page: send the given page to the stream
56e93d26 1409 *
3d0684b2 1410 * Returns the number of pages written.
3fd3c4b3
DDAG
1411 * < 0 - error
1412 * >=0 - Number of pages written - this might legally be 0
1413 * if xbzrle noticed the page was the same.
56e93d26 1414 *
6f37bb8b 1415 * @rs: current RAM state
56e93d26
JQ
1416 * @block: block that contains the page we want to send
1417 * @offset: offset inside the block for the page
56e93d26 1418 */
05931ec5 1419static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1420{
1421 int pages = -1;
56e93d26 1422 uint8_t *p;
56e93d26 1423 bool send_async = true;
a08f6890 1424 RAMBlock *block = pss->block;
8bba004c 1425 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1426 ram_addr_t current_addr = block->offset + offset;
56e93d26 1427
2f68e399 1428 p = block->host + offset;
1db9d8e5 1429 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1430
56e93d26 1431 XBZRLE_cache_lock();
f3095cc8 1432 if (rs->xbzrle_started && !migration_in_postcopy()) {
ec6f3ab9 1433 pages = save_xbzrle_page(rs, pss, &p, current_addr,
61717ea9 1434 block, offset);
05931ec5 1435 if (!rs->last_stage) {
059ff0fb
XG
1436 /* Can't send this cached data async, since the cache page
1437 * might get updated before it gets to the wire
56e93d26 1438 */
059ff0fb 1439 send_async = false;
56e93d26
JQ
1440 }
1441 }
1442
1443 /* XBZRLE overflow or normal page */
1444 if (pages == -1) {
ec6f3ab9 1445 pages = save_normal_page(pss, block, offset, p, send_async);
56e93d26
JQ
1446 }
1447
1448 XBZRLE_cache_unlock();
1449
1450 return pages;
1451}
1452
61717ea9 1453static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
b9ee2f7d
JQ
1454 ram_addr_t offset)
1455{
61717ea9 1456 if (multifd_queue_page(file, block, offset) < 0) {
713f762a
IR
1457 return -1;
1458 }
aff3f660 1459 stat64_add(&mig_stats.normal_pages, 1);
b9ee2f7d
JQ
1460
1461 return 1;
1462}
1463
97274a87
LS
1464static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream,
1465 RAMBlock *block, ram_addr_t offset,
1466 uint8_t *source_buf)
56e93d26 1467{
20d549cb 1468 uint8_t *p = block->host + offset;
6ef3771c 1469 int ret;
56e93d26 1470
3e81763e 1471 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
97274a87 1472 return RES_ZEROPAGE;
5e5fdcff
XG
1473 }
1474
34ab9e97
XG
1475 /*
1476 * copy it to an internal buffer to avoid it being modified by the VM,
1477 * so that we can catch any error during compression and
1478 * decompression
1479 */
1480 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1481 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1482 if (ret < 0) {
1483 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1484 error_report("compressed data failed!");
97274a87 1485 return RES_NONE;
b3be2896 1486 }
97274a87 1487 return RES_COMPRESS;
5e5fdcff
XG
1488}
1489
1490static void
1491update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1492{
4c2d0f6d 1493 ram_transferred_add(bytes_xmit);
76e03000 1494
97274a87 1495 if (param->result == RES_ZEROPAGE) {
aff3f660 1496 stat64_add(&mig_stats.zero_pages, 1);
76e03000 1497 return;
5e5fdcff 1498 }
76e03000
XG
1499
1500 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1501 compression_counters.compressed_size += bytes_xmit - 8;
1502 compression_counters.pages++;
56e93d26
JQ
1503}
1504
32b05495
XG
1505static bool save_page_use_compression(RAMState *rs);
1506
b5cf1cd3
LS
1507static inline void compress_reset_result(CompressParam *param)
1508{
1509 param->result = RES_NONE;
1510 param->block = NULL;
1511 param->offset = 0;
1512}
1513
3e81763e 1514static int send_queued_data(CompressParam *param)
56e93d26 1515{
3e81763e 1516 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
eaa238ab 1517 MigrationState *ms = migrate_get_current();
3e81763e
LS
1518 QEMUFile *file = ms->to_dst_file;
1519 int len = 0;
1520
1521 RAMBlock *block = param->block;
1522 ram_addr_t offset = param->offset;
1523
1524 if (param->result == RES_NONE) {
1525 return 0;
1526 }
1527
1528 assert(block == pss->last_sent_block);
1529
1530 if (param->result == RES_ZEROPAGE) {
1531 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1532 qemu_put_byte(file, 0);
1533 len += 1;
1534 ram_release_page(block->idstr, offset);
1535 } else if (param->result == RES_COMPRESS) {
1536 len += save_page_header(pss, file, block,
1537 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1538 len += qemu_put_qemu_file(file, param->file);
1539 } else {
1540 abort();
1541 }
1542
680628d2
LS
1543 update_compress_thread_counts(param, len);
1544
3e81763e
LS
1545 return len;
1546}
1547
ef4f5f5d 1548static void flush_compressed_data(int (send_queued_data(CompressParam *)))
3e81763e 1549{
680628d2 1550 int idx, thread_count;
56e93d26 1551
56e93d26 1552 thread_count = migrate_compress_threads();
a7a9a88f 1553
0d9f9a5c 1554 qemu_mutex_lock(&comp_done_lock);
56e93d26 1555 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1556 while (!comp_param[idx].done) {
0d9f9a5c 1557 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1558 }
a7a9a88f 1559 }
0d9f9a5c 1560 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1561
1562 for (idx = 0; idx < thread_count; idx++) {
1563 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1564 if (!comp_param[idx].quit) {
b5cf1cd3 1565 CompressParam *param = &comp_param[idx];
680628d2 1566 send_queued_data(param);
b5cf1cd3 1567 compress_reset_result(param);
56e93d26 1568 }
a7a9a88f 1569 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1570 }
1571}
1572
ef4f5f5d
LS
1573static void ram_flush_compressed_data(RAMState *rs)
1574{
1575 if (!save_page_use_compression(rs)) {
1576 return;
1577 }
1578
1579 flush_compressed_data(send_queued_data);
1580}
1581
56e93d26
JQ
1582static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1583 ram_addr_t offset)
1584{
1585 param->block = block;
1586 param->offset = offset;
10c2f7b7 1587 param->trigger = true;
56e93d26
JQ
1588}
1589
ef4f5f5d
LS
1590static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset,
1591 int (send_queued_data(CompressParam *)))
56e93d26 1592{
680628d2 1593 int idx, thread_count, pages = -1;
1d58872a 1594 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1595
1596 thread_count = migrate_compress_threads();
0d9f9a5c 1597 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1598retry:
1599 for (idx = 0; idx < thread_count; idx++) {
1600 if (comp_param[idx].done) {
b5cf1cd3
LS
1601 CompressParam *param = &comp_param[idx];
1602 qemu_mutex_lock(&param->mutex);
1603 param->done = false;
680628d2 1604 send_queued_data(param);
b5cf1cd3
LS
1605 compress_reset_result(param);
1606 set_compress_params(param, block, offset);
1607
b5cf1cd3
LS
1608 qemu_cond_signal(&param->cond);
1609 qemu_mutex_unlock(&param->mutex);
1d58872a 1610 pages = 1;
56e93d26 1611 break;
56e93d26
JQ
1612 }
1613 }
1d58872a
XG
1614
1615 /*
1616 * wait for the free thread if the user specifies 'compress-wait-thread',
1617 * otherwise we will post the page out in the main thread as normal page.
1618 */
1619 if (pages < 0 && wait) {
1620 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1621 goto retry;
1622 }
0d9f9a5c 1623 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1624
1625 return pages;
1626}
1627
31e2ac74
JQ
1628#define PAGE_ALL_CLEAN 0
1629#define PAGE_TRY_AGAIN 1
1630#define PAGE_DIRTY_FOUND 2
3d0684b2
JQ
1631/**
1632 * find_dirty_block: find the next dirty page and update any state
1633 * associated with the search process.
b9e60928 1634 *
31e2ac74 1635 * Returns:
294e5a40 1636 * <0: An error happened
31e2ac74
JQ
1637 * PAGE_ALL_CLEAN: no dirty page found, give up
1638 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1639 * PAGE_DIRTY_FOUND: dirty page found
b9e60928 1640 *
6f37bb8b 1641 * @rs: current RAM state
3d0684b2
JQ
1642 * @pss: data about the state of the current dirty page scan
1643 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1644 */
31e2ac74 1645static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
b9e60928 1646{
d9e474ea
PX
1647 /* Update pss->page for the next dirty bit in ramblock */
1648 pss_find_next_dirty(pss);
1649
6f37bb8b 1650 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1651 pss->page >= rs->last_page) {
b9e60928
DDAG
1652 /*
1653 * We've been once around the RAM and haven't found anything.
1654 * Give up.
1655 */
31e2ac74 1656 return PAGE_ALL_CLEAN;
b9e60928 1657 }
542147f4
DH
1658 if (!offset_in_ramblock(pss->block,
1659 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1660 /* Didn't find anything in this RAM Block */
a935e30f 1661 pss->page = 0;
b9e60928
DDAG
1662 pss->block = QLIST_NEXT_RCU(pss->block, next);
1663 if (!pss->block) {
294e5a40
JQ
1664 if (!migrate_multifd_flush_after_each_section()) {
1665 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1666 int ret = multifd_send_sync_main(f);
1667 if (ret < 0) {
1668 return ret;
1669 }
1670 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1671 qemu_fflush(f);
1672 }
48df9d80
XG
1673 /*
1674 * If memory migration starts over, we will meet a dirtied page
1675 * which may still exist in the compression threads' ring, so we
1676 * should flush the compressed data to make sure the new page
1677 * is not overwritten by the old one in the destination.
1678 *
1679 * Also, if xbzrle is on, stop using data compression at this
1680 * point. In theory, xbzrle can do better than compression.
1681 */
ef4f5f5d 1682 ram_flush_compressed_data(rs);
48df9d80 1683
b9e60928
DDAG
1684 /* Hit the end of the list */
1685 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1686 /* Flag that we've looped */
1687 pss->complete_round = true;
1a373522 1688 /* After the first round, enable XBZRLE. */
87dca0c9 1689 if (migrate_xbzrle()) {
f3095cc8 1690 rs->xbzrle_started = true;
1a373522 1691 }
b9e60928
DDAG
1692 }
1693 /* Didn't find anything this time, but try again on the new block */
31e2ac74 1694 return PAGE_TRY_AGAIN;
b9e60928 1695 } else {
31e2ac74
JQ
1696 /* We've found something */
1697 return PAGE_DIRTY_FOUND;
b9e60928
DDAG
1698 }
1699}
1700
3d0684b2
JQ
1701/**
1702 * unqueue_page: gets a page of the queue
1703 *
a82d593b 1704 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1705 *
3d0684b2
JQ
1706 * Returns the block of the page (or NULL if none available)
1707 *
ec481c6c 1708 * @rs: current RAM state
3d0684b2 1709 * @offset: used to return the offset within the RAMBlock
a82d593b 1710 */
f20e2865 1711static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1712{
a1fe28df 1713 struct RAMSrcPageRequest *entry;
a82d593b
DDAG
1714 RAMBlock *block = NULL;
1715
a1fe28df 1716 if (!postcopy_has_request(rs)) {
ae526e32
XG
1717 return NULL;
1718 }
1719
6e8a355d 1720 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1721
1722 /*
1723 * This should _never_ change even after we take the lock, because no one
1724 * should be taking anything off the request list other than us.
1725 */
1726 assert(postcopy_has_request(rs));
1727
1728 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1729 block = entry->rb;
1730 *offset = entry->offset;
1731
777f53c7
TH
1732 if (entry->len > TARGET_PAGE_SIZE) {
1733 entry->len -= TARGET_PAGE_SIZE;
1734 entry->offset += TARGET_PAGE_SIZE;
a1fe28df
PX
1735 } else {
1736 memory_region_unref(block->mr);
1737 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1738 g_free(entry);
1739 migration_consume_urgent_request();
a82d593b 1740 }
a82d593b
DDAG
1741
1742 return block;
1743}
1744
278e2f55
AG
1745#if defined(__linux__)
1746/**
1747 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1748 * is found, return RAM block pointer and page offset
1749 *
1750 * Returns pointer to the RAMBlock containing faulting page,
1751 * NULL if no write faults are pending
1752 *
1753 * @rs: current RAM state
1754 * @offset: page offset from the beginning of the block
1755 */
1756static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1757{
1758 struct uffd_msg uffd_msg;
1759 void *page_address;
82ea3e3b 1760 RAMBlock *block;
278e2f55
AG
1761 int res;
1762
1763 if (!migrate_background_snapshot()) {
1764 return NULL;
1765 }
1766
1767 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1768 if (res <= 0) {
1769 return NULL;
1770 }
1771
1772 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1773 block = qemu_ram_block_from_host(page_address, false, offset);
1774 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1775 return block;
278e2f55
AG
1776}
1777
1778/**
1779 * ram_save_release_protection: release UFFD write protection after
1780 * a range of pages has been saved
1781 *
1782 * @rs: current RAM state
1783 * @pss: page-search-status structure
1784 * @start_page: index of the first page in the range relative to pss->block
1785 *
1786 * Returns 0 on success, negative value in case of an error
1787*/
1788static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1789 unsigned long start_page)
1790{
1791 int res = 0;
1792
1793 /* Check if page is from UFFD-managed region. */
1794 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1795 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1796 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1797
1798 /* Flush async buffers before un-protect. */
61717ea9 1799 qemu_fflush(pss->pss_channel);
278e2f55
AG
1800 /* Un-protect memory range. */
1801 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1802 false, false);
1803 }
1804
1805 return res;
1806}
1807
1808/* ram_write_tracking_available: check if kernel supports required UFFD features
1809 *
1810 * Returns true if supports, false otherwise
1811 */
1812bool ram_write_tracking_available(void)
1813{
1814 uint64_t uffd_features;
1815 int res;
1816
1817 res = uffd_query_features(&uffd_features);
1818 return (res == 0 &&
1819 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1820}
1821
1822/* ram_write_tracking_compatible: check if guest configuration is
1823 * compatible with 'write-tracking'
1824 *
1825 * Returns true if compatible, false otherwise
1826 */
1827bool ram_write_tracking_compatible(void)
1828{
1829 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1830 int uffd_fd;
82ea3e3b 1831 RAMBlock *block;
278e2f55
AG
1832 bool ret = false;
1833
1834 /* Open UFFD file descriptor */
1835 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1836 if (uffd_fd < 0) {
1837 return false;
1838 }
1839
1840 RCU_READ_LOCK_GUARD();
1841
82ea3e3b 1842 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1843 uint64_t uffd_ioctls;
1844
1845 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1846 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1847 continue;
1848 }
1849 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1850 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1851 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1852 goto out;
1853 }
1854 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1855 goto out;
1856 }
1857 }
1858 ret = true;
1859
1860out:
1861 uffd_close_fd(uffd_fd);
1862 return ret;
1863}
1864
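/*
 * [Editor's illustrative sketch -- not part of ram.c] A caller enabling a
 * write-tracking based feature (such as background snapshots) would
 * typically gate on both helpers above, roughly like this; the function
 * name is hypothetical.
 */
static bool example_can_use_uffd_wp_tracking(void)
{
    return ram_write_tracking_available() && ram_write_tracking_compatible();
}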
f7b9dcfb
DH
1865static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1866 ram_addr_t size)
1867{
5f19a449
DH
1868 const ram_addr_t end = offset + size;
1869
f7b9dcfb
DH
1870 /*
1871 * We read one byte of each page; this will preallocate page tables if
1872 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1873 * where no page was populated yet. This might require adaptation when
1874 * supporting other mappings, like shmem.
1875 */
5f19a449 1876 for (; offset < end; offset += block->page_size) {
f7b9dcfb
DH
1877 char tmp = *((char *)block->host + offset);
1878
1879 /* Don't optimize the read out */
1880 asm volatile("" : "+r" (tmp));
1881 }
1882}
1883
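/*
 * [Editor's illustrative sketch -- not part of ram.c] The same prefaulting
 * trick in isolation: a volatile read of one byte per page forces the kernel
 * to populate page tables (and the shared zeropage for anonymous MAP_PRIVATE
 * memory) before userfaultfd write protection is armed.
 */
#include <stddef.h>

static void prefault_range(volatile const char *start, size_t len,
                           size_t page_size)
{
    for (size_t off = 0; off < len; off += page_size) {
        (void)start[off];           /* volatile keeps the read alive */
    }
}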
6fee3a1f
DH
1884static inline int populate_read_section(MemoryRegionSection *section,
1885 void *opaque)
1886{
1887 const hwaddr size = int128_get64(section->size);
1888 hwaddr offset = section->offset_within_region;
1889 RAMBlock *block = section->mr->ram_block;
1890
1891 populate_read_range(block, offset, size);
1892 return 0;
1893}
1894
eeccb99c 1895/*
f7b9dcfb
DH
1896 * ram_block_populate_read: preallocate page tables and populate pages in the
1897 * RAM block by reading a byte of each page.
eeccb99c
AG
1898 *
1899 * Since it's solely used for the userfault_fd WP feature, we just
1900 * hardcode the page size to qemu_real_host_page_size here.
1901 *
82ea3e3b 1902 * @rb: RAM block to populate
eeccb99c 1903 */
6fee3a1f 1904static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1905{
6fee3a1f
DH
1906 /*
1907 * Skip populating all pages that fall into a discarded range as managed by
1908 * a RamDiscardManager responsible for the mapped memory region of the
1909 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1910 * must not get populated automatically. We don't have to track
1911 * modifications via userfaultfd WP reliably, because these pages will
1912 * not be part of the migration stream either way -- see
1913 * ramblock_dirty_bitmap_exclude_discarded_pages().
1914 *
1915 * Note: The result is only stable while migrating (precopy/postcopy).
1916 */
1917 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1918 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1919 MemoryRegionSection section = {
1920 .mr = rb->mr,
1921 .offset_within_region = 0,
1922 .size = rb->mr->size,
1923 };
1924
1925 ram_discard_manager_replay_populated(rdm, &section,
1926 populate_read_section, NULL);
1927 } else {
1928 populate_read_range(rb, 0, rb->used_length);
1929 }
eeccb99c
AG
1930}
1931
1932/*
1933 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1934 */
1935void ram_write_tracking_prepare(void)
1936{
82ea3e3b 1937 RAMBlock *block;
eeccb99c
AG
1938
1939 RCU_READ_LOCK_GUARD();
1940
82ea3e3b 1941 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1942 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1943 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1944 continue;
1945 }
1946
1947 /*
1948 * Populate pages of the RAM block before enabling userfault_fd
1949 * write protection.
1950 *
1951 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1952 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1953 * pages with pte_none() entries in page table.
1954 */
f7b9dcfb 1955 ram_block_populate_read(block);
eeccb99c
AG
1956 }
1957}
1958
e41c5770
DH
1959static inline int uffd_protect_section(MemoryRegionSection *section,
1960 void *opaque)
1961{
1962 const hwaddr size = int128_get64(section->size);
1963 const hwaddr offset = section->offset_within_region;
1964 RAMBlock *rb = section->mr->ram_block;
1965 int uffd_fd = (uintptr_t)opaque;
1966
1967 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1968 false);
1969}
1970
1971static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1972{
1973 assert(rb->flags & RAM_UF_WRITEPROTECT);
1974
1975 /* See ram_block_populate_read() */
1976 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1977 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1978 MemoryRegionSection section = {
1979 .mr = rb->mr,
1980 .offset_within_region = 0,
1981 .size = rb->mr->size,
1982 };
1983
1984 return ram_discard_manager_replay_populated(rdm, &section,
1985 uffd_protect_section,
1986 (void *)(uintptr_t)uffd_fd);
1987 }
1988 return uffd_change_protection(uffd_fd, rb->host,
1989 rb->used_length, true, false);
1990}
1991
278e2f55
AG
1992/*
1993 * ram_write_tracking_start: start UFFD-WP memory tracking
1994 *
1995 * Returns 0 for success or negative value in case of error
1996 */
1997int ram_write_tracking_start(void)
1998{
1999 int uffd_fd;
2000 RAMState *rs = ram_state;
82ea3e3b 2001 RAMBlock *block;
278e2f55
AG
2002
2003 /* Open UFFD file descriptor */
2004 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
2005 if (uffd_fd < 0) {
2006 return uffd_fd;
2007 }
2008 rs->uffdio_fd = uffd_fd;
2009
2010 RCU_READ_LOCK_GUARD();
2011
82ea3e3b 2012 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 2013 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 2014 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
2015 continue;
2016 }
2017
2018 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
2019 if (uffd_register_memory(rs->uffdio_fd, block->host,
2020 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
2021 goto fail;
2022 }
72ef3a37
DH
2023 block->flags |= RAM_UF_WRITEPROTECT;
2024 memory_region_ref(block->mr);
2025
278e2f55 2026 /* Apply UFFD write protection to the block memory range */
e41c5770 2027 if (ram_block_uffd_protect(block, uffd_fd)) {
278e2f55
AG
2028 goto fail;
2029 }
278e2f55 2030
82ea3e3b
AG
2031 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
2032 block->host, block->max_length);
278e2f55
AG
2033 }
2034
2035 return 0;
2036
2037fail:
2038 error_report("ram_write_tracking_start() failed: restoring initial memory state");
2039
82ea3e3b
AG
2040 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2041 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
2042 continue;
2043 }
82ea3e3b 2044 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 2045 /* Cleanup flags and remove reference */
82ea3e3b
AG
2046 block->flags &= ~RAM_UF_WRITEPROTECT;
2047 memory_region_unref(block->mr);
278e2f55
AG
2048 }
2049
2050 uffd_close_fd(uffd_fd);
2051 rs->uffdio_fd = -1;
2052 return -1;
2053}
2054
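/*
 * [Editor's illustrative sketch -- not part of ram.c] The raw userfaultfd
 * write-protect sequence that the uffd_* helpers used above wrap: create the
 * fd, negotiate the WP feature, register the range in WP mode, then arm write
 * protection.  Error handling is reduced to returning -1; write-protect
 * support for anonymous memory is assumed (Linux >= 5.7).
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int example_wp_protect_range(void *addr, unsigned long len)
{
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (fd < 0) {
        return -1;
    }

    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wp = {
        .range = { .start = (unsigned long)addr, .len = len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };

    if (ioctl(fd, UFFDIO_API, &api) ||
        ioctl(fd, UFFDIO_REGISTER, &reg) ||
        ioctl(fd, UFFDIO_WRITEPROTECT, &wp)) {
        close(fd);
        return -1;
    }
    return fd;      /* the caller reads write-fault events from this fd */
}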
2055/**
2056 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2057 */
2058void ram_write_tracking_stop(void)
2059{
2060 RAMState *rs = ram_state;
82ea3e3b 2061 RAMBlock *block;
278e2f55
AG
2062
2063 RCU_READ_LOCK_GUARD();
2064
82ea3e3b
AG
2065 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2066 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
2067 continue;
2068 }
82ea3e3b 2069 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 2070
82ea3e3b
AG
2071 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2072 block->host, block->max_length);
278e2f55
AG
2073
2074 /* Cleanup flags and remove reference */
82ea3e3b
AG
2075 block->flags &= ~RAM_UF_WRITEPROTECT;
2076 memory_region_unref(block->mr);
278e2f55
AG
2077 }
2078
2079 /* Finally close UFFD file descriptor */
2080 uffd_close_fd(rs->uffdio_fd);
2081 rs->uffdio_fd = -1;
2082}
2083
2084#else
2085/* No target OS support, stubs just fail or ignore */
2086
2087static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2088{
2089 (void) rs;
2090 (void) offset;
2091
2092 return NULL;
2093}
2094
2095static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2096 unsigned long start_page)
2097{
2098 (void) rs;
2099 (void) pss;
2100 (void) start_page;
2101
2102 return 0;
2103}
2104
2105bool ram_write_tracking_available(void)
2106{
2107 return false;
2108}
2109
2110bool ram_write_tracking_compatible(void)
2111{
2112 assert(0);
2113 return false;
2114}
2115
2116int ram_write_tracking_start(void)
2117{
2118 assert(0);
2119 return -1;
2120}
2121
2122void ram_write_tracking_stop(void)
2123{
2124 assert(0);
2125}
2126#endif /* defined(__linux__) */
2127
3d0684b2 2128/**
ff1543af 2129 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2130 *
2131 * Skips pages that are already sent (!dirty)
a82d593b 2132 *
a5f7b1a6 2133 * Returns true if a queued page is found
a82d593b 2134 *
6f37bb8b 2135 * @rs: current RAM state
3d0684b2 2136 * @pss: data about the state of the current dirty page scan
a82d593b 2137 */
f20e2865 2138static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2139{
2140 RAMBlock *block;
2141 ram_addr_t offset;
777f53c7
TH
2142 bool dirty;
2143
2144 do {
2145 block = unqueue_page(rs, &offset);
2146 /*
2147 * We're sending this page, and since it's postcopy nothing else
2148 * will dirty it, and we must make sure it doesn't get sent again
2149 * even if this queue request was received after the background
2150 * search already sent it.
2151 */
2152 if (block) {
2153 unsigned long page;
2154
2155 page = offset >> TARGET_PAGE_BITS;
2156 dirty = test_bit(page, block->bmap);
2157 if (!dirty) {
2158 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2159 page);
2160 } else {
2161 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2162 }
2163 }
a82d593b 2164
777f53c7 2165 } while (block && !dirty);
a82d593b 2166
b062106d 2167 if (!block) {
278e2f55
AG
2168 /*
2169 * Poll write faults too if background snapshot is enabled; that's
2170 * when vcpus may get blocked by the write-protected pages.
2171 */
2172 block = poll_fault_page(rs, &offset);
2173 }
2174
a82d593b 2175 if (block) {
a82d593b
DDAG
2176 /*
2177 * We want the background search to continue from the queued page
2178 * since the guest is likely to want other pages near to the page
2179 * it just requested.
2180 */
2181 pss->block = block;
a935e30f 2182 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2183
2184 /*
2185 * This unqueued page would break the "one round" check, even if it is
2186 * really rare.
2187 */
2188 pss->complete_round = false;
a82d593b
DDAG
2189 }
2190
2191 return !!block;
2192}
2193
6c595cde 2194/**
5e58f968
JQ
2195 * migration_page_queue_free: drop any remaining pages in the ram
2196 * request queue
6c595cde 2197 *
3d0684b2
JQ
2198 * It should be empty at the end anyway, but in error cases there may
2199 * be some left. In case any page is left, we drop it.
2200 *
6c595cde 2201 */
83c13382 2202static void migration_page_queue_free(RAMState *rs)
6c595cde 2203{
ec481c6c 2204 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2205 /* This queue should generally be empty - but in the case of a failed
2206 * migration it might have some leftovers in it.
2207 */
89ac5a1d 2208 RCU_READ_LOCK_GUARD();
ec481c6c 2209 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2210 memory_region_unref(mspr->rb->mr);
ec481c6c 2211 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2212 g_free(mspr);
2213 }
6c595cde
DDAG
2214}
2215
2216/**
3d0684b2
JQ
2217 * ram_save_queue_pages: queue the page for transmission
2218 *
2219 * A request from postcopy destination for example.
2220 *
2221 * Returns zero on success or negative on error
2222 *
3d0684b2
JQ
2223 * @rbname: Name of the RAMBlock of the request. NULL means the
2224 * same as the last one.
2225 * @start: starting address from the start of the RAMBlock
2226 * @len: length (in bytes) to send
6c595cde 2227 */
96506894 2228int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2229{
2230 RAMBlock *ramblock;
53518d94 2231 RAMState *rs = ram_state;
6c595cde 2232
aff3f660 2233 stat64_add(&mig_stats.postcopy_requests, 1);
89ac5a1d
DDAG
2234 RCU_READ_LOCK_GUARD();
2235
6c595cde
DDAG
2236 if (!rbname) {
2237 /* Reuse last RAMBlock */
68a098f3 2238 ramblock = rs->last_req_rb;
6c595cde
DDAG
2239
2240 if (!ramblock) {
2241 /*
2242 * Shouldn't happen, we can't reuse the last RAMBlock if
2243 * it's the 1st request.
2244 */
2245 error_report("ram_save_queue_pages no previous block");
03acb4e9 2246 return -1;
6c595cde
DDAG
2247 }
2248 } else {
2249 ramblock = qemu_ram_block_by_name(rbname);
2250
2251 if (!ramblock) {
2252 /* We shouldn't be asked for a non-existent RAMBlock */
2253 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2254 return -1;
6c595cde 2255 }
68a098f3 2256 rs->last_req_rb = ramblock;
6c595cde
DDAG
2257 }
2258 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2259 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2260 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2261 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2262 __func__, start, len, ramblock->used_length);
03acb4e9 2263 return -1;
6c595cde
DDAG
2264 }
2265
93589827
PX
2266 /*
2267 * When with postcopy preempt, we send back the page directly in the
2268 * rp-return thread.
2269 */
2270 if (postcopy_preempt_active()) {
2271 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2272 size_t page_size = qemu_ram_pagesize(ramblock);
2273 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2274 int ret = 0;
2275
2276 qemu_mutex_lock(&rs->bitmap_mutex);
2277
2278 pss_init(pss, ramblock, page_start);
2279 /*
2280 * Always use the preempt channel, and make sure it's there. It's
2281 * safe to access without the lock, because while the rp-thread is running
2282 * we should be the only one operating on the qemufile.
2283 */
2284 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
93589827
PX
2285 assert(pss->pss_channel);
2286
2287 /*
2288 * It must be either one host page or a multiple of the host page size.
2289 * Just assert; if something is wrong we're mostly split-brain anyway.
2290 */
2291 assert(len % page_size == 0);
2292 while (len) {
2293 if (ram_save_host_page_urgent(pss)) {
2294 error_report("%s: ram_save_host_page_urgent() failed: "
2295 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2296 __func__, ramblock->idstr, start);
2297 ret = -1;
2298 break;
2299 }
2300 /*
2301 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2302 * will automatically be moved and point to the next host page
2303 * we're going to send, so no need to update here.
2304 *
2305 * Normally QEMU never sends >1 host page in requests, so
2306 * logically we don't even need that as the loop should only
2307 * run once, but just to be consistent.
2308 */
2309 len -= page_size;
2310 };
2311 qemu_mutex_unlock(&rs->bitmap_mutex);
2312
2313 return ret;
2314 }
2315
ec481c6c 2316 struct RAMSrcPageRequest *new_entry =
b21e2380 2317 g_new0(struct RAMSrcPageRequest, 1);
6c595cde
DDAG
2318 new_entry->rb = ramblock;
2319 new_entry->offset = start;
2320 new_entry->len = len;
2321
2322 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2323 qemu_mutex_lock(&rs->src_page_req_mutex);
2324 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2325 migration_make_urgent_request();
ec481c6c 2326 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2327
2328 return 0;
6c595cde
DDAG
2329}
2330
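/*
 * [Editor's illustrative usage -- not part of ram.c] Roughly how a caller on
 * the return path queues an urgent page after a postcopy page request; the
 * RAMBlock name and offsets are made up for the example.
 */
static void example_queue_postcopy_request(void)
{
    const char *rbname = "pc.ram";      /* hypothetical RAMBlock name */
    ram_addr_t start = 0x200000;        /* host-page aligned offset */
    ram_addr_t len = TARGET_PAGE_SIZE;

    if (ram_save_queue_pages(rbname, start, len) < 0) {
        /* a real caller would mark the migration as failed here */
    }
}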
d7400a34
XG
2331static bool save_page_use_compression(RAMState *rs)
2332{
a7a94d14 2333 if (!migrate_compress()) {
d7400a34
XG
2334 return false;
2335 }
2336
2337 /*
1a373522
DH
2338 * If xbzrle is enabled (e.g., after first round of migration), stop
2339 * using the data compression. In theory, xbzrle can do better than
2340 * compression.
d7400a34 2341 */
f3095cc8 2342 if (rs->xbzrle_started) {
1a373522 2343 return false;
d7400a34
XG
2344 }
2345
1a373522 2346 return true;
d7400a34
XG
2347}
2348
5e5fdcff
XG
2349/*
2350 * try to compress the page before posting it out, return true if the page
2351 * has been properly handled by compression, otherwise needs other
2352 * paths to handle it
2353 */
ec6f3ab9
PX
2354static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2355 RAMBlock *block, ram_addr_t offset)
5e5fdcff
XG
2356{
2357 if (!save_page_use_compression(rs)) {
2358 return false;
2359 }
2360
2361 /*
2362 * When starting to process a new block, the first page of
2363 * the block should be sent out before other pages in the same
2364 * block, and all the pages in the last block should have been sent
2365 * out. Keeping this order is important, because the 'cont' flag
2366 * is used to avoid resending the block name.
2367 *
2368 * We post the first page as a normal page, as compression will take
2369 * a lot of CPU resources.
2370 */
ec6f3ab9 2371 if (block != pss->last_sent_block) {
ef4f5f5d 2372 ram_flush_compressed_data(rs);
5e5fdcff
XG
2373 return false;
2374 }
2375
ef4f5f5d 2376 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
5e5fdcff
XG
2377 return true;
2378 }
2379
76e03000 2380 compression_counters.busy++;
5e5fdcff
XG
2381 return false;
2382}
2383
a82d593b 2384/**
4010ba38 2385 * ram_save_target_page_legacy: save one target page
a82d593b 2386 *
3d0684b2 2387 * Returns the number of pages written
a82d593b 2388 *
6f37bb8b 2389 * @rs: current RAM state
3d0684b2 2390 * @pss: data about the page we want to send
a82d593b 2391 */
4010ba38 2392static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
a82d593b 2393{
a8ec91f9 2394 RAMBlock *block = pss->block;
8bba004c 2395 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2396 int res;
2397
61717ea9 2398 if (control_save_page(pss, block, offset, &res)) {
a8ec91f9
XG
2399 return res;
2400 }
2401
ec6f3ab9 2402 if (save_compress_page(rs, pss, block, offset)) {
5e5fdcff 2403 return 1;
d7400a34
XG
2404 }
2405
37502df3 2406 res = save_zero_page(pss, pss->pss_channel, block, offset);
d7400a34
XG
2407 if (res > 0) {
2408 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2409 * page would be stale
2410 */
f3095cc8 2411 if (rs->xbzrle_started) {
d7400a34
XG
2412 XBZRLE_cache_lock();
2413 xbzrle_cache_zero_page(rs, block->offset + offset);
2414 XBZRLE_cache_unlock();
2415 }
d7400a34
XG
2416 return res;
2417 }
2418
da3f56cb 2419 /*
6f39c90b
PX
2420 * Do not use multifd in postcopy as one whole host page should be
2421 * placed. Meanwhile postcopy requires atomic update of pages, so even
2422 * if host page size == guest page size the dest guest during run may
2423 * still see partially copied pages which is data corruption.
da3f56cb 2424 */
51b07548 2425 if (migrate_multifd() && !migration_in_postcopy()) {
61717ea9 2426 return ram_save_multifd_page(pss->pss_channel, block, offset);
a82d593b
DDAG
2427 }
2428
05931ec5 2429 return ram_save_page(rs, pss);
a82d593b
DDAG
2430}
2431
d9e474ea
PX
2432/* Should be called before sending a host page */
2433static void pss_host_page_prepare(PageSearchStatus *pss)
2434{
2435 /* How many guest pages are there in one host page? */
2436 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2437
2438 pss->host_page_sending = true;
301d7ffe
PX
2439 if (guest_pfns <= 1) {
2440 /*
2441 * This covers both the case when guest psize == host psize and the
2442 * case when the guest has a larger psize than the host (guest_pfns==0).
2443 *
2444 * For the latter, we always send one whole guest page per
2445 * iteration of the host page (example: an Alpha VM on x86 host
2446 * will have guest psize 8K while host psize 4K).
2447 */
2448 pss->host_page_start = pss->page;
2449 pss->host_page_end = pss->page + 1;
2450 } else {
2451 /*
2452 * The host page spans over multiple guest pages, we send them
2453 * within the same host page iteration.
2454 */
2455 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2456 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2457 }
d9e474ea
PX
2458}
2459
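/*
 * [Editor's illustrative sketch -- not part of ram.c] The host-page window
 * computed above, reduced to plain integers: with 4K target pages inside a
 * 2M hugepage block, guest_pfns == 512, so a dirty target page at index 1000
 * yields the window [512, 1024).
 */
#include <assert.h>

static void example_host_page_window(void)
{
    unsigned long guest_pfns = 512;                     /* 2M / 4K */
    unsigned long page = 1000;                          /* dirty target-page index */
    unsigned long start = page - (page % guest_pfns);   /* ROUND_DOWN(page, guest_pfns) */
    unsigned long end = start + guest_pfns;             /* ROUND_UP(page + 1, guest_pfns) */

    assert(start == 512 && end == 1024);
}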
2460/*
2461 * Whether the page pointed by PSS is within the host page being sent.
2462 * Must be called after a previous pss_host_page_prepare().
2463 */
2464static bool pss_within_range(PageSearchStatus *pss)
2465{
2466 ram_addr_t ram_addr;
2467
2468 assert(pss->host_page_sending);
2469
2470 /* Over host-page boundary? */
2471 if (pss->page >= pss->host_page_end) {
2472 return false;
2473 }
2474
2475 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2476
2477 return offset_in_ramblock(pss->block, ram_addr);
2478}
2479
2480static void pss_host_page_finish(PageSearchStatus *pss)
2481{
2482 pss->host_page_sending = false;
2483 /* This is not needed, but just to reset it */
2484 pss->host_page_start = pss->host_page_end = 0;
2485}
2486
93589827
PX
2487/*
2488 * Send an urgent host page specified by `pss'. Need to be called with
2489 * bitmap_mutex held.
2490 *
2491 * Returns 0 if saving the host page succeeded, negative otherwise.
2492 */
2493static int ram_save_host_page_urgent(PageSearchStatus *pss)
2494{
2495 bool page_dirty, sent = false;
2496 RAMState *rs = ram_state;
2497 int ret = 0;
2498
2499 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2500 pss_host_page_prepare(pss);
2501
2502 /*
2503 * If precopy is sending the same page, let it be done in precopy, or
2504 * we could send the same page in two channels and none of them will
2505 * receive the whole page.
2506 */
2507 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2508 trace_postcopy_preempt_hit(pss->block->idstr,
2509 pss->page << TARGET_PAGE_BITS);
2510 return 0;
2511 }
2512
2513 do {
2514 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2515
2516 if (page_dirty) {
2517 /* Be strict to return code; it must be 1, or what else? */
4010ba38 2518 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
93589827
PX
2519 error_report_once("%s: ram_save_target_page failed", __func__);
2520 ret = -1;
2521 goto out;
2522 }
2523 sent = true;
2524 }
2525 pss_find_next_dirty(pss);
2526 } while (pss_within_range(pss));
2527out:
2528 pss_host_page_finish(pss);
2529 /* For urgent requests, flush immediately if sent */
2530 if (sent) {
2531 qemu_fflush(pss->pss_channel);
2532 }
2533 return ret;
2534}
2535
a82d593b 2536/**
3d0684b2 2537 * ram_save_host_page: save a whole host page
a82d593b 2538 *
3d0684b2
JQ
2539 * Starting at *offset send pages up to the end of the current host
2540 * page. It's valid for the initial offset to point into the middle of
2541 * a host page in which case the remainder of the hostpage is sent.
2542 * Only dirty target pages are sent. Note that the host page size may
2543 * be a huge page for this block.
f3321554 2544 *
1eb3fc0a
DDAG
2545 * The saving stops at the boundary of the used_length of the block
2546 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2547 *
f3321554
PX
2548 * The caller must hold ram_state.bitmap_mutex when calling this
2549 * function. Note that this function can temporarily release the lock, but
2550 * it makes sure the lock is held again before it returns.
2551 *
3d0684b2
JQ
2552 * Returns the number of pages written or negative on error
2553 *
6f37bb8b 2554 * @rs: current RAM state
3d0684b2 2555 * @pss: data about the page we want to send
a82d593b 2556 */
05931ec5 2557static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2558{
f3321554 2559 bool page_dirty, preempt_active = postcopy_preempt_active();
a82d593b 2560 int tmppages, pages = 0;
a935e30f
JQ
2561 size_t pagesize_bits =
2562 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
2563 unsigned long start_page = pss->page;
2564 int res;
4c011c37 2565
fbd162e6 2566 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2567 error_report("block %s should not be migrated !", pss->block->idstr);
2568 return 0;
2569 }
2570
d9e474ea
PX
2571 /* Update host page boundary information */
2572 pss_host_page_prepare(pss);
2573
a82d593b 2574 do {
f3321554 2575 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
a82d593b 2576
f3321554
PX
2577 /* Check whether the page is dirty and, if it is, send it */
2578 if (page_dirty) {
ba1b7c81 2579 /*
f3321554
PX
2580 * Properly yield the lock only in postcopy preempt mode
2581 * because both migration thread and rp-return thread can
2582 * operate on the bitmaps.
ba1b7c81 2583 */
f3321554
PX
2584 if (preempt_active) {
2585 qemu_mutex_unlock(&rs->bitmap_mutex);
ba1b7c81 2586 }
4010ba38 2587 tmppages = migration_ops->ram_save_target_page(rs, pss);
f3321554
PX
2588 if (tmppages >= 0) {
2589 pages += tmppages;
2590 /*
2591 * Allow rate limiting to happen in the middle of huge pages if
2592 * something is sent in the current iteration.
2593 */
2594 if (pagesize_bits > 1 && tmppages > 0) {
2595 migration_rate_limit();
2596 }
2597 }
2598 if (preempt_active) {
2599 qemu_mutex_lock(&rs->bitmap_mutex);
2600 }
2601 } else {
2602 tmppages = 0;
23feba90 2603 }
f3321554
PX
2604
2605 if (tmppages < 0) {
d9e474ea 2606 pss_host_page_finish(pss);
f3321554
PX
2607 return tmppages;
2608 }
2609
d9e474ea
PX
2610 pss_find_next_dirty(pss);
2611 } while (pss_within_range(pss));
2612
2613 pss_host_page_finish(pss);
278e2f55
AG
2614
2615 res = ram_save_release_protection(rs, pss, start_page);
2616 return (res < 0 ? res : pages);
a82d593b 2617}
6c595cde 2618
56e93d26 2619/**
3d0684b2 2620 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2621 *
2622 * Called within an RCU critical section.
2623 *
e8f3735f
XG
2624 * Returns the number of pages written where zero means no dirty pages,
2625 * or negative on error
56e93d26 2626 *
6f37bb8b 2627 * @rs: current RAM state
a82d593b
DDAG
2628 *
2629 * On systems where host-page-size > target-page-size it will send all the
2630 * pages in a host page that are dirty.
56e93d26 2631 */
05931ec5 2632static int ram_find_and_save_block(RAMState *rs)
56e93d26 2633{
f1668764 2634 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
56e93d26 2635 int pages = 0;
56e93d26 2636
0827b9e9 2637 /* No dirty page as there is zero RAM */
8d80e195 2638 if (!rs->ram_bytes_total) {
0827b9e9
AA
2639 return pages;
2640 }
2641
4934a5dd
PX
2642 /*
2643 * Always keep last_seen_block/last_page valid during this procedure,
2644 * because find_dirty_block() relies on these values (e.g., we compare
2645 * last_seen_block with pss.block to see whether we searched all the
2646 * ramblocks) to detect the completion of migration. Having a NULL
2647 * last_seen_block can cause the loop below to run forever.
2648 */
2649 if (!rs->last_seen_block) {
2650 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2651 rs->last_page = 0;
2652 }
2653
f1668764 2654 pss_init(pss, rs->last_seen_block, rs->last_page);
b8fb8cb7 2655
31e2ac74 2656 while (true) {
51efd36f 2657 if (!get_queued_page(rs, pss)) {
b062106d 2658 /* priority queue empty, so just search for something dirty */
31e2ac74
JQ
2659 int res = find_dirty_block(rs, pss);
2660 if (res != PAGE_DIRTY_FOUND) {
2661 if (res == PAGE_ALL_CLEAN) {
51efd36f 2662 break;
31e2ac74
JQ
2663 } else if (res == PAGE_TRY_AGAIN) {
2664 continue;
294e5a40
JQ
2665 } else if (res < 0) {
2666 pages = res;
2667 break;
51efd36f
JQ
2668 }
2669 }
56e93d26 2670 }
51efd36f 2671 pages = ram_save_host_page(rs, pss);
31e2ac74
JQ
2672 if (pages) {
2673 break;
2674 }
2675 }
56e93d26 2676
f1668764
PX
2677 rs->last_seen_block = pss->block;
2678 rs->last_page = pss->page;
56e93d26
JQ
2679
2680 return pages;
2681}
2682
8008a272 2683static uint64_t ram_bytes_total_with_ignored(void)
56e93d26
JQ
2684{
2685 RAMBlock *block;
2686 uint64_t total = 0;
2687
89ac5a1d
DDAG
2688 RCU_READ_LOCK_GUARD();
2689
8008a272
JQ
2690 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2691 total += block->used_length;
99e15582 2692 }
56e93d26
JQ
2693 return total;
2694}
2695
fbd162e6
YK
2696uint64_t ram_bytes_total(void)
2697{
8008a272
JQ
2698 RAMBlock *block;
2699 uint64_t total = 0;
2700
2701 RCU_READ_LOCK_GUARD();
2702
2703 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2704 total += block->used_length;
2705 }
2706 return total;
fbd162e6
YK
2707}
2708
f265e0e4 2709static void xbzrle_load_setup(void)
56e93d26 2710{
f265e0e4 2711 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2712}
2713
f265e0e4
JQ
2714static void xbzrle_load_cleanup(void)
2715{
2716 g_free(XBZRLE.decoded_buf);
2717 XBZRLE.decoded_buf = NULL;
2718}
2719
7d7c96be
PX
2720static void ram_state_cleanup(RAMState **rsp)
2721{
b9ccaf6d
DDAG
2722 if (*rsp) {
2723 migration_page_queue_free(*rsp);
2724 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2725 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2726 g_free(*rsp);
2727 *rsp = NULL;
2728 }
7d7c96be
PX
2729}
2730
84593a08
PX
2731static void xbzrle_cleanup(void)
2732{
2733 XBZRLE_cache_lock();
2734 if (XBZRLE.cache) {
2735 cache_fini(XBZRLE.cache);
2736 g_free(XBZRLE.encoded_buf);
2737 g_free(XBZRLE.current_buf);
2738 g_free(XBZRLE.zero_target_page);
2739 XBZRLE.cache = NULL;
2740 XBZRLE.encoded_buf = NULL;
2741 XBZRLE.current_buf = NULL;
2742 XBZRLE.zero_target_page = NULL;
2743 }
2744 XBZRLE_cache_unlock();
2745}
2746
f265e0e4 2747static void ram_save_cleanup(void *opaque)
56e93d26 2748{
53518d94 2749 RAMState **rsp = opaque;
6b6712ef 2750 RAMBlock *block;
eb859c53 2751
278e2f55
AG
2752 /* We don't use dirty log with background snapshots */
2753 if (!migrate_background_snapshot()) {
2754 /* caller have hold iothread lock or is in a bh, so there is
2755 * no writing race against the migration bitmap
2756 */
63b41db4
HH
2757 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2758 /*
2759 * do not stop dirty log without starting it, since
2760 * memory_global_dirty_log_stop will assert that
2761 * memory_global_dirty_log_start/stop used in pairs
2762 */
2763 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2764 }
278e2f55 2765 }
6b6712ef 2766
fbd162e6 2767 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2768 g_free(block->clear_bmap);
2769 block->clear_bmap = NULL;
6b6712ef
JQ
2770 g_free(block->bmap);
2771 block->bmap = NULL;
56e93d26
JQ
2772 }
2773
84593a08 2774 xbzrle_cleanup();
f0afa331 2775 compress_threads_save_cleanup();
7d7c96be 2776 ram_state_cleanup(rsp);
4010ba38
JQ
2777 g_free(migration_ops);
2778 migration_ops = NULL;
56e93d26
JQ
2779}
2780
6f37bb8b 2781static void ram_state_reset(RAMState *rs)
56e93d26 2782{
ec6f3ab9
PX
2783 int i;
2784
2785 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2786 rs->pss[i].last_sent_block = NULL;
2787 }
2788
6f37bb8b 2789 rs->last_seen_block = NULL;
269ace29 2790 rs->last_page = 0;
6f37bb8b 2791 rs->last_version = ram_list.version;
f3095cc8 2792 rs->xbzrle_started = false;
56e93d26
JQ
2793}
2794
2795#define MAX_WAIT 50 /* ms, half buffered_file limit */
2796
e0b266f0
DDAG
2797/* **** functions for postcopy ***** */
2798
ced1c616
PB
2799void ram_postcopy_migrated_memory_release(MigrationState *ms)
2800{
2801 struct RAMBlock *block;
ced1c616 2802
fbd162e6 2803 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2804 unsigned long *bitmap = block->bmap;
2805 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2806 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2807
2808 while (run_start < range) {
2809 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2810 ram_discard_range(block->idstr,
2811 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2812 ((ram_addr_t)(run_end - run_start))
2813 << TARGET_PAGE_BITS);
ced1c616
PB
2814 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2815 }
2816 }
2817}
2818
3d0684b2
JQ
2819/**
2820 * postcopy_send_discard_bm_ram: discard a RAMBlock
2821 *
e0b266f0 2822 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2823 *
2824 * @ms: current migration state
89dab31b 2825 * @block: RAMBlock to discard
e0b266f0 2826 */
9e7d1223 2827static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2828{
6b6712ef 2829 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2830 unsigned long current;
1e7cf8c3 2831 unsigned long *bitmap = block->bmap;
e0b266f0 2832
6b6712ef 2833 for (current = 0; current < end; ) {
1e7cf8c3 2834 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2835 unsigned long zero, discard_length;
e0b266f0 2836
33a5cb62
WY
2837 if (one >= end) {
2838 break;
2839 }
e0b266f0 2840
1e7cf8c3 2841 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2842
2843 if (zero >= end) {
2844 discard_length = end - one;
e0b266f0 2845 } else {
33a5cb62
WY
2846 discard_length = zero - one;
2847 }
810cf2bb 2848 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2849 current = one + discard_length;
e0b266f0 2850 }
e0b266f0
DDAG
2851}
2852
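/*
 * [Editor's illustrative sketch -- not part of ram.c] The same run-length
 * walk in plain C: scan a bitmap and emit a (start, length) pair for every
 * run of set bits, which is what the discard protocol sends per RAMBlock.
 * emit() is a hypothetical callback.
 */
#include <stddef.h>
#include <stdint.h>

static int test_bit_u8(const uint8_t *bits, size_t i)
{
    return bits[i / 8] & (1u << (i % 8));
}

static void for_each_set_run(const uint8_t *bits, size_t nbits,
                             void (*emit)(size_t start, size_t len))
{
    size_t i = 0;

    while (i < nbits) {
        while (i < nbits && !test_bit_u8(bits, i)) {
            i++;                    /* skip clear bits */
        }
        if (i >= nbits) {
            break;
        }
        size_t start = i;
        while (i < nbits && test_bit_u8(bits, i)) {
            i++;                    /* consume the run of set bits */
        }
        emit(start, i - start);
    }
}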
f30c2e5b
PX
2853static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2854
3d0684b2
JQ
2855/**
2856 * postcopy_each_ram_send_discard: discard all RAMBlocks
2857 *
e0b266f0
DDAG
2858 * Utility for the outgoing postcopy code.
2859 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2860 * passing it bitmap indexes and name.
e0b266f0
DDAG
2861 * (qemu_ram_foreach_block ends up passing unscaled lengths
2862 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2863 *
2864 * @ms: current migration state
e0b266f0 2865 */
739fcc1b 2866static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2867{
2868 struct RAMBlock *block;
e0b266f0 2869
fbd162e6 2870 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2871 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2872
f30c2e5b
PX
2873 /*
2874 * Deal with TPS != HPS and huge pages. It discards any partially sent
2875 * host-page size chunks and marks any partially dirty host-page size
2876 * chunks as all dirty. In this case the host-page is the host-page
2877 * for the particular RAMBlock, i.e. it might be a huge page.
2878 */
2879 postcopy_chunk_hostpages_pass(ms, block);
2880
e0b266f0
DDAG
2881 /*
2882 * Postcopy sends chunks of bitmap over the wire, but it
2883 * just needs indexes at this point, avoids it having
2884 * target page specific code.
2885 */
739fcc1b 2886 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2887 postcopy_discard_send_finish(ms);
e0b266f0 2888 }
e0b266f0
DDAG
2889}
2890
3d0684b2 2891/**
8324ef86 2892 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2893 *
2894 * Helper for postcopy_chunk_hostpages; it's called twice to
2895 * canonicalize the two bitmaps, which are similar, but one is
2896 * inverted.
99e314eb 2897 *
3d0684b2
JQ
2898 * Postcopy requires that all target pages in a hostpage are dirty or
2899 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2900 *
3d0684b2 2901 * @ms: current migration state
3d0684b2 2902 * @block: block that contains the page we want to canonicalize
99e314eb 2903 */
1e7cf8c3 2904static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2905{
53518d94 2906 RAMState *rs = ram_state;
6b6712ef 2907 unsigned long *bitmap = block->bmap;
29c59172 2908 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2909 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2910 unsigned long run_start;
2911
29c59172
DDAG
2912 if (block->page_size == TARGET_PAGE_SIZE) {
2913 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2914 return;
2915 }
2916
1e7cf8c3
WY
2917 /* Find a dirty page */
2918 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2919
6b6712ef 2920 while (run_start < pages) {
99e314eb
DDAG
2921
2922 /*
2923 * If the start of this run of pages is in the middle of a host
2924 * page, then we need to fixup this host page.
2925 */
9dec3cc3 2926 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2927 /* Find the end of this run */
1e7cf8c3 2928 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2929 /*
2930 * If the end isn't at the start of a host page, then the
2931 * run doesn't finish at the end of a host page
2932 * and we need to discard.
2933 */
99e314eb
DDAG
2934 }
2935
9dec3cc3 2936 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2937 unsigned long page;
dad45ab2
WY
2938 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2939 host_ratio);
2940 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2941
99e314eb
DDAG
2942 /* Clean up the bitmap */
2943 for (page = fixup_start_addr;
2944 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2945 /*
2946 * Remark them as dirty, updating the count for any pages
2947 * that weren't previously dirty.
2948 */
0d8ec885 2949 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2950 }
2951 }
2952
1e7cf8c3
WY
2953 /* Find the next dirty page for the next iteration */
2954 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2955 }
2956}
2957
3d0684b2
JQ
2958/**
2959 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2960 *
e0b266f0
DDAG
2961 * Transmit the set of pages to be discarded after precopy to the target;
2962 * these are pages that:
2963 * a) Have been previously transmitted but are now dirty again
2964 * b) Have never been transmitted; this ensures that
2965 * any pages on the destination that have been mapped by background
2966 * tasks get discarded (transparent huge pages are the specific concern)
2967 * Hopefully this is pretty sparse
3d0684b2
JQ
2968 *
2969 * @ms: current migration state
e0b266f0 2970 */
739fcc1b 2971void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2972{
53518d94 2973 RAMState *rs = ram_state;
e0b266f0 2974
89ac5a1d 2975 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2976
2977 /* This should be our last sync, the src is now paused */
eb859c53 2978 migration_bitmap_sync(rs);
e0b266f0 2979
6b6712ef 2980 /* Easiest way to make sure we don't resume in the middle of a host-page */
ec6f3ab9 2981 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
6b6712ef 2982 rs->last_seen_block = NULL;
6b6712ef 2983 rs->last_page = 0;
e0b266f0 2984
739fcc1b 2985 postcopy_each_ram_send_discard(ms);
e0b266f0 2986
739fcc1b 2987 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2988}
2989
3d0684b2
JQ
2990/**
2991 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2992 *
3d0684b2 2993 * Returns zero on success
e0b266f0 2994 *
36449157
JQ
2995 * @rbname: name of the RAMBlock of the request. NULL means the
2996 * same that last one.
3d0684b2
JQ
2997 * @start: RAMBlock starting page
2998 * @length: RAMBlock size
e0b266f0 2999 */
aaa2064c 3000int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 3001{
36449157 3002 trace_ram_discard_range(rbname, start, length);
d3a5038c 3003
89ac5a1d 3004 RCU_READ_LOCK_GUARD();
36449157 3005 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
3006
3007 if (!rb) {
36449157 3008 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 3009 return -1;
e0b266f0
DDAG
3010 }
3011
814bb08f
PX
3012 /*
3013 * On source VM, we don't need to update the received bitmap since
3014 * we don't even have one.
3015 */
3016 if (rb->receivedmap) {
3017 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3018 length >> qemu_target_page_bits());
3019 }
3020
03acb4e9 3021 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3022}
3023
84593a08
PX
3024/*
3025 * For every allocation, we will try not to crash the VM if the
3026 * allocation fails.
3027 */
3028static int xbzrle_init(void)
3029{
3030 Error *local_err = NULL;
3031
87dca0c9 3032 if (!migrate_xbzrle()) {
84593a08
PX
3033 return 0;
3034 }
3035
3036 XBZRLE_cache_lock();
3037
3038 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3039 if (!XBZRLE.zero_target_page) {
3040 error_report("%s: Error allocating zero page", __func__);
3041 goto err_out;
3042 }
3043
3044 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3045 TARGET_PAGE_SIZE, &local_err);
3046 if (!XBZRLE.cache) {
3047 error_report_err(local_err);
3048 goto free_zero_page;
3049 }
3050
3051 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3052 if (!XBZRLE.encoded_buf) {
3053 error_report("%s: Error allocating encoded_buf", __func__);
3054 goto free_cache;
3055 }
3056
3057 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3058 if (!XBZRLE.current_buf) {
3059 error_report("%s: Error allocating current_buf", __func__);
3060 goto free_encoded_buf;
3061 }
3062
3063 /* We are all good */
3064 XBZRLE_cache_unlock();
3065 return 0;
3066
3067free_encoded_buf:
3068 g_free(XBZRLE.encoded_buf);
3069 XBZRLE.encoded_buf = NULL;
3070free_cache:
3071 cache_fini(XBZRLE.cache);
3072 XBZRLE.cache = NULL;
3073free_zero_page:
3074 g_free(XBZRLE.zero_target_page);
3075 XBZRLE.zero_target_page = NULL;
3076err_out:
3077 XBZRLE_cache_unlock();
3078 return -ENOMEM;
3079}
3080
53518d94 3081static int ram_state_init(RAMState **rsp)
56e93d26 3082{
7d00ee6a
PX
3083 *rsp = g_try_new0(RAMState, 1);
3084
3085 if (!*rsp) {
3086 error_report("%s: Init ramstate fail", __func__);
3087 return -1;
3088 }
53518d94
JQ
3089
3090 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3091 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3092 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
8d80e195 3093 (*rsp)->ram_bytes_total = ram_bytes_total();
56e93d26 3094
7d00ee6a 3095 /*
40c4d4a8
IR
3096 * Count the total number of pages used by ram blocks not including any
3097 * gaps due to alignment or unplugs.
03158519 3098 * This must match the initial value of the dirty bitmap.
7d00ee6a 3099 */
8d80e195 3100 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
7d00ee6a
PX
3101 ram_state_reset(*rsp);
3102
3103 return 0;
3104}
3105
d6eff5d7 3106static void ram_list_init_bitmaps(void)
7d00ee6a 3107{
002cad6b 3108 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3109 RAMBlock *block;
3110 unsigned long pages;
002cad6b 3111 uint8_t shift;
56e93d26 3112
0827b9e9
AA
3113 /* Skip setting bitmap if there is no RAM */
3114 if (ram_bytes_total()) {
002cad6b
PX
3115 shift = ms->clear_bitmap_shift;
3116 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3117 error_report("clear_bitmap_shift (%u) too big, using "
3118 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3119 shift = CLEAR_BITMAP_SHIFT_MAX;
3120 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3121 error_report("clear_bitmap_shift (%u) too small, using "
3122 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3123 shift = CLEAR_BITMAP_SHIFT_MIN;
3124 }
3125
fbd162e6 3126 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3127 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3128 /*
3129 * The initial dirty bitmap for migration must be set with all
3130 * ones to make sure we'll migrate every guest RAM page to
3131 * destination.
40c4d4a8
IR
3132 * Here we set RAMBlock.bmap all to 1 because when restarting a
3133 * new migration after a failed migration, ram_list.
3134 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3135 * guest memory.
03158519 3136 */
6b6712ef 3137 block->bmap = bitmap_new(pages);
40c4d4a8 3138 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3139 block->clear_bmap_shift = shift;
3140 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3141 }
f3f491fc 3142 }
d6eff5d7
PX
3143}
3144
be39b4cd
DH
3145static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3146{
3147 unsigned long pages;
3148 RAMBlock *rb;
3149
3150 RCU_READ_LOCK_GUARD();
3151
3152 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3153 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3154 rs->migration_dirty_pages -= pages;
3155 }
3156}
3157
d6eff5d7
PX
3158static void ram_init_bitmaps(RAMState *rs)
3159{
3160 /* For memory_global_dirty_log_start below. */
3161 qemu_mutex_lock_iothread();
3162 qemu_mutex_lock_ramlist();
f3f491fc 3163
89ac5a1d
DDAG
3164 WITH_RCU_READ_LOCK_GUARD() {
3165 ram_list_init_bitmaps();
278e2f55
AG
3166 /* We don't use dirty log with background snapshots */
3167 if (!migrate_background_snapshot()) {
63b41db4 3168 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
3169 migration_bitmap_sync_precopy(rs);
3170 }
89ac5a1d 3171 }
56e93d26 3172 qemu_mutex_unlock_ramlist();
49877834 3173 qemu_mutex_unlock_iothread();
be39b4cd
DH
3174
3175 /*
3176 * After an eventual first bitmap sync, fixup the initial bitmap
3177 * containing all 1s to exclude any discarded pages from migration.
3178 */
3179 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
3180}
3181
3182static int ram_init_all(RAMState **rsp)
3183{
3184 if (ram_state_init(rsp)) {
3185 return -1;
3186 }
3187
3188 if (xbzrle_init()) {
3189 ram_state_cleanup(rsp);
3190 return -1;
3191 }
3192
3193 ram_init_bitmaps(*rsp);
a91246c9
HZ
3194
3195 return 0;
3196}
3197
08614f34
PX
3198static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3199{
3200 RAMBlock *block;
3201 uint64_t pages = 0;
3202
3203 /*
3204 * Postcopy is not using xbzrle/compression, so no need for that.
3205 * Also, since the source is already halted, we don't need to care
3206 * about dirty page logging either.
3207 */
3208
fbd162e6 3209 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3210 pages += bitmap_count_one(block->bmap,
3211 block->used_length >> TARGET_PAGE_BITS);
3212 }
3213
3214 /* This may not be aligned with current bitmaps. Recalculate. */
3215 rs->migration_dirty_pages = pages;
3216
1a373522 3217 ram_state_reset(rs);
08614f34
PX
3218
3219 /* Update RAMState cache of output QEMUFile */
7f401b80 3220 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
08614f34
PX
3221
3222 trace_ram_state_resume_prepare(pages);
3223}
3224
6bcb05fc
WW
3225/*
3226 * This function clears bits of the free pages reported by the caller from the
3227 * migration dirty bitmap. @addr is the host address corresponding to the
3228 * start of the continuous guest free pages, and @len is the total bytes of
3229 * those pages.
3230 */
3231void qemu_guest_free_page_hint(void *addr, size_t len)
3232{
3233 RAMBlock *block;
3234 ram_addr_t offset;
3235 size_t used_len, start, npages;
3236 MigrationState *s = migrate_get_current();
3237
3238 /* This function is currently expected to be used during live migration */
3239 if (!migration_is_setup_or_active(s->state)) {
3240 return;
3241 }
3242
3243 for (; len > 0; len -= used_len, addr += used_len) {
3244 block = qemu_ram_block_from_host(addr, false, &offset);
3245 if (unlikely(!block || offset >= block->used_length)) {
3246 /*
3247 * The implementation might not support RAMBlock resize during
3248 * live migration, but it could happen in theory with future
3249 * updates. So we add a check here to capture that case.
3250 */
3251 error_report_once("%s unexpected error", __func__);
3252 return;
3253 }
3254
3255 if (len <= block->used_length - offset) {
3256 used_len = len;
3257 } else {
3258 used_len = block->used_length - offset;
3259 }
3260
3261 start = offset >> TARGET_PAGE_BITS;
3262 npages = used_len >> TARGET_PAGE_BITS;
3263
3264 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
3265 /*
3266 * The skipped free pages are equivalent to having been sent from clear_bmap's
3267 * perspective, so clear the bits from the memory region bitmap which
3268 * are initially set. Otherwise those skipped pages will be sent in
3269 * the next round after syncing from the memory region bitmap.
3270 */
1230a25f 3271 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
3272 ram_state->migration_dirty_pages -=
3273 bitmap_count_one_with_offset(block->bmap, start, npages);
3274 bitmap_clear(block->bmap, start, npages);
3275 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3276 }
3277}
3278
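/*
 * [Editor's illustrative sketch -- not part of ram.c] The per-iteration
 * clamping done by the loop above: a hint chunk is limited to what is left
 * of the containing block, and the remainder is handled in the next pass.
 */
#include <stddef.h>

static size_t example_clamp_hint_chunk(size_t offset, size_t len,
                                       size_t block_used_length)
{
    size_t room = block_used_length - offset;   /* offset < used_length holds */

    return len <= room ? len : room;
}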
3d0684b2
JQ
3279/*
3280 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3281 * long-running RCU critical section. When rcu-reclaims in the code
3282 * start to become numerous it will be necessary to reduce the
3283 * granularity of these critical sections.
3284 */
3285
3d0684b2
JQ
3286/**
3287 * ram_save_setup: Setup RAM for migration
3288 *
3289 * Returns zero to indicate success and negative for error
3290 *
3291 * @f: QEMUFile where to send the data
3292 * @opaque: RAMState pointer
3293 */
a91246c9
HZ
3294static int ram_save_setup(QEMUFile *f, void *opaque)
3295{
53518d94 3296 RAMState **rsp = opaque;
a91246c9 3297 RAMBlock *block;
33d70973 3298 int ret;
a91246c9 3299
dcaf446e
XG
3300 if (compress_threads_save_setup()) {
3301 return -1;
3302 }
3303
a91246c9
HZ
3304 /* migration has already setup the bitmap, reuse it. */
3305 if (!migration_in_colo_state()) {
7d00ee6a 3306 if (ram_init_all(rsp) != 0) {
dcaf446e 3307 compress_threads_save_cleanup();
a91246c9 3308 return -1;
53518d94 3309 }
a91246c9 3310 }
7f401b80 3311 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
a91246c9 3312
0e6ebd48 3313 WITH_RCU_READ_LOCK_GUARD() {
8008a272
JQ
3314 qemu_put_be64(f, ram_bytes_total_with_ignored()
3315 | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3316
0e6ebd48
DDAG
3317 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3318 qemu_put_byte(f, strlen(block->idstr));
3319 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3320 qemu_put_be64(f, block->used_length);
3321 if (migrate_postcopy_ram() && block->page_size !=
3322 qemu_host_page_size) {
3323 qemu_put_be64(f, block->page_size);
3324 }
3325 if (migrate_ignore_shared()) {
3326 qemu_put_be64(f, block->mr->addr);
3327 }
fbd162e6 3328 }
56e93d26
JQ
3329 }
3330
56e93d26
JQ
3331 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3332 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3333
4010ba38
JQ
3334 migration_ops = g_malloc0(sizeof(MigrationOps));
3335 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
8ebb6ecc 3336 ret = multifd_send_sync_main(f);
33d70973
LB
3337 if (ret < 0) {
3338 return ret;
3339 }
3340
294e5a40
JQ
3341 if (!migrate_multifd_flush_after_each_section()) {
3342 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3343 }
3344
56e93d26 3345 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3346 qemu_fflush(f);
56e93d26
JQ
3347
3348 return 0;
3349}
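/*
 * Rough sketch of the setup-stage stream produced above, derived from the
 * qemu_put_* calls in ram_save_setup() and shown only as a reading aid:
 *
 *   be64  ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable RAMBlock:
 *       u8     strlen(idstr)
 *       bytes  idstr
 *       be64   used_length
 *       be64   page_size   (only with postcopy-ram and non-host page size)
 *       be64   mr->addr    (only with x-ignore-shared)
 *   (the RAM_CONTROL_SETUP hooks may insert RDMA control data here)
 *   be64  RAM_SAVE_FLAG_MULTIFD_FLUSH  (unless flushing after each section)
 *   be64  RAM_SAVE_FLAG_EOS
 */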
3350
3d0684b2
JQ
3351/**
3352 * ram_save_iterate: iterative stage for migration
3353 *
3354 * Returns zero to indicate success and negative for error
3355 *
3356 * @f: QEMUFile where to send the data
3357 * @opaque: RAMState pointer
3358 */
56e93d26
JQ
3359static int ram_save_iterate(QEMUFile *f, void *opaque)
3360{
53518d94
JQ
3361 RAMState **temp = opaque;
3362 RAMState *rs = *temp;
3d4095b2 3363 int ret = 0;
56e93d26
JQ
3364 int i;
3365 int64_t t0;
5c90308f 3366 int done = 0;
56e93d26 3367
b2557345
PL
3368 if (blk_mig_bulk_active()) {
3369 /* Avoid transferring ram during bulk phase of block migration as
3370 * the bulk phase will usually take a long time and transferring
3371 * ram updates during that time is pointless. */
3372 goto out;
3373 }
3374
63268c49
PX
3375 /*
3376 * We'll hold this lock for a while, but that's okay for two reasons.
3377 * Firstly, the only other thread that could take it is the one calling
3378 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3379 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3380 * guarantees that we'll at least release it on a regular basis.
3381 */
3382 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3383 WITH_RCU_READ_LOCK_GUARD() {
3384 if (ram_list.version != rs->last_version) {
3385 ram_state_reset(rs);
3386 }
56e93d26 3387
89ac5a1d
DDAG
3388 /* Read version before ram_list.blocks */
3389 smp_rmb();
56e93d26 3390
89ac5a1d 3391 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3392
89ac5a1d
DDAG
3393 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3394 i = 0;
3395 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 3396 postcopy_has_request(rs)) {
89ac5a1d 3397 int pages;
e03a34f8 3398
89ac5a1d
DDAG
3399 if (qemu_file_get_error(f)) {
3400 break;
3401 }
e8f3735f 3402
05931ec5 3403 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3404 /* no more pages to send */
3405 if (pages == 0) {
3406 done = 1;
3407 break;
3408 }
e8f3735f 3409
89ac5a1d
DDAG
3410 if (pages < 0) {
3411 qemu_file_set_error(f, pages);
56e93d26
JQ
3412 break;
3413 }
89ac5a1d
DDAG
3414
3415 rs->target_page_count += pages;
3416
644acf99
WY
3417 /*
3418 * During postcopy, it is necessary to make sure one whole host
3419 * page is sent in one chunk.
3420 */
3421 if (migrate_postcopy_ram()) {
ef4f5f5d 3422 ram_flush_compressed_data(rs);
644acf99
WY
3423 }
3424
89ac5a1d
DDAG
3425 /*
3426 * We want to check in the 1st loop, just in case it was the 1st
3427 * time and we had to sync the dirty bitmap.
3428 * qemu_clock_get_ns() is a bit expensive, so we only check once
3429 * every few iterations.
3430 */
3431 if ((i & 63) == 0) {
3432 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3433 1000000;
3434 if (t1 > MAX_WAIT) {
3435 trace_ram_save_iterate_big_wait(t1, i);
3436 break;
3437 }
3438 }
3439 i++;
56e93d26 3440 }
56e93d26 3441 }
63268c49 3442 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3443
3444 /*
3445 * Must occur before EOS (or any QEMUFile operation)
3446 * because of RDMA protocol.
3447 */
3448 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3449
b2557345 3450out:
b69a0227
JQ
3451 if (ret >= 0
3452 && migration_is_setup_or_active(migrate_get_current()->state)) {
b05292c2
JQ
3453 if (migrate_multifd_flush_after_each_section()) {
3454 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3455 if (ret < 0) {
3456 return ret;
3457 }
33d70973
LB
3458 }
3459
3d4095b2
JQ
3460 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3461 qemu_fflush(f);
4c2d0f6d 3462 ram_transferred_add(8);
56e93d26 3463
3d4095b2
JQ
3464 ret = qemu_file_get_error(f);
3465 }
56e93d26
JQ
3466 if (ret < 0) {
3467 return ret;
3468 }
3469
5c90308f 3470 return done;
56e93d26
JQ
3471}
3472
3d0684b2
JQ
3473/**
3474 * ram_save_complete: function called to send the remaining amount of ram
3475 *
e8f3735f 3476 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3477 *
3478 * Called with iothread lock
3479 *
3480 * @f: QEMUFile where to send the data
3481 * @opaque: RAMState pointer
3482 */
56e93d26
JQ
3483static int ram_save_complete(QEMUFile *f, void *opaque)
3484{
53518d94
JQ
3485 RAMState **temp = opaque;
3486 RAMState *rs = *temp;
e8f3735f 3487 int ret = 0;
6f37bb8b 3488
05931ec5
JQ
3489 rs->last_stage = !migration_in_colo_state();
3490
89ac5a1d
DDAG
3491 WITH_RCU_READ_LOCK_GUARD() {
3492 if (!migration_in_postcopy()) {
3493 migration_bitmap_sync_precopy(rs);
3494 }
56e93d26 3495
89ac5a1d 3496 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3497
89ac5a1d 3498 /* try transferring iterative blocks of memory */
56e93d26 3499
89ac5a1d 3500 /* flush all remaining blocks regardless of rate limiting */
c13221b5 3501 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
3502 while (true) {
3503 int pages;
56e93d26 3504
05931ec5 3505 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3506 /* no more blocks to send */
3507 if (pages == 0) {
3508 break;
3509 }
3510 if (pages < 0) {
3511 ret = pages;
3512 break;
3513 }
e8f3735f 3514 }
c13221b5 3515 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 3516
ef4f5f5d 3517 ram_flush_compressed_data(rs);
89ac5a1d
DDAG
3518 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3519 }
d09a6fde 3520
33d70973
LB
3521 if (ret < 0) {
3522 return ret;
3d4095b2 3523 }
56e93d26 3524
7f401b80 3525 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
33d70973
LB
3526 if (ret < 0) {
3527 return ret;
3528 }
3529
294e5a40
JQ
3530 if (!migrate_multifd_flush_after_each_section()) {
3531 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3532 }
33d70973
LB
3533 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3534 qemu_fflush(f);
3535
3536 return 0;
56e93d26
JQ
3537}
3538
24beea4e
JQ
3539static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3540 uint64_t *can_postcopy)
56e93d26 3541{
53518d94
JQ
3542 RAMState **temp = opaque;
3543 RAMState *rs = *temp;
56e93d26 3544
c8df4a7a 3545 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3546
c8df4a7a
JQ
3547 if (migrate_postcopy_ram()) {
3548 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3549 *can_postcopy += remaining_size;
c8df4a7a 3550 } else {
24beea4e 3551 *must_precopy += remaining_size;
c8df4a7a
JQ
3552 }
3553}
3554
24beea4e
JQ
3555static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3556 uint64_t *can_postcopy)
c8df4a7a 3557{
28ef5339 3558 MigrationState *s = migrate_get_current();
c8df4a7a
JQ
3559 RAMState **temp = opaque;
3560 RAMState *rs = *temp;
3561
3562 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3563
28ef5339 3564 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
56e93d26 3565 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3566 WITH_RCU_READ_LOCK_GUARD() {
3567 migration_bitmap_sync_precopy(rs);
3568 }
56e93d26 3569 qemu_mutex_unlock_iothread();
9edabd4d 3570 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3571 }
c31b098f 3572
86e1167e
VSO
3573 if (migrate_postcopy_ram()) {
3574 /* We can do postcopy, and all the data is postcopiable */
24beea4e 3575 *can_postcopy += remaining_size;
86e1167e 3576 } else {
24beea4e 3577 *must_precopy += remaining_size;
86e1167e 3578 }
56e93d26
JQ
3579}
3580
3581static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3582{
3583 unsigned int xh_len;
3584 int xh_flags;
063e760a 3585 uint8_t *loaded_data;
56e93d26 3586
56e93d26
JQ
3587 /* extract RLE header */
3588 xh_flags = qemu_get_byte(f);
3589 xh_len = qemu_get_be16(f);
3590
3591 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3592 error_report("Failed to load XBZRLE page - wrong compression!");
3593 return -1;
3594 }
3595
3596 if (xh_len > TARGET_PAGE_SIZE) {
3597 error_report("Failed to load XBZRLE page - len overflow!");
3598 return -1;
3599 }
f265e0e4 3600 loaded_data = XBZRLE.decoded_buf;
56e93d26 3601 /* load data and decode */
f265e0e4 3602 /* it can change loaded_data to point to an internal buffer */
063e760a 3603 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3604
3605 /* decode RLE */
063e760a 3606 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3607 TARGET_PAGE_SIZE) == -1) {
3608 error_report("Failed to load XBZRLE page - decode error!");
3609 return -1;
3610 }
3611
3612 return 0;
3613}
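/*
 * For reference, the XBZRLE record parsed above looks like this on the
 * wire (a sketch derived from the qemu_get_* calls in load_xbzrle()):
 *
 *   u8    xh_flags   (must be ENCODING_FLAG_XBZRLE)
 *   be16  xh_len     (encoded length, at most TARGET_PAGE_SIZE)
 *   bytes xh_len bytes of XBZRLE-encoded delta, applied against the
 *         current contents of @host by xbzrle_decode_buffer()
 */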
3614
3d0684b2
JQ
3615/**
3616 * ram_block_from_stream: read a RAMBlock id from the migration stream
3617 *
3618 * Must be called from within a rcu critical section.
3619 *
56e93d26 3620 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3621 *
755e8d7c 3622 * @mis: the migration incoming state pointer
3d0684b2
JQ
3623 * @f: QEMUFile where to read the data from
3624 * @flags: Page flags (mostly to see if it's a continuation of previous block)
c01b16ed 3625 * @channel: the channel we're using
a7180877 3626 */
755e8d7c 3627static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
c01b16ed
PX
3628 QEMUFile *f, int flags,
3629 int channel)
56e93d26 3630{
c01b16ed 3631 RAMBlock *block = mis->last_recv_block[channel];
56e93d26
JQ
3632 char id[256];
3633 uint8_t len;
3634
3635 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3636 if (!block) {
56e93d26
JQ
3637 error_report("Ack, bad migration stream!");
3638 return NULL;
3639 }
4c4bad48 3640 return block;
56e93d26
JQ
3641 }
3642
3643 len = qemu_get_byte(f);
3644 qemu_get_buffer(f, (uint8_t *)id, len);
3645 id[len] = 0;
3646
e3dd7493 3647 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3648 if (!block) {
3649 error_report("Can't find block %s", id);
3650 return NULL;
56e93d26
JQ
3651 }
3652
fbd162e6 3653 if (ramblock_is_ignored(block)) {
b895de50
CLG
3654 error_report("block %s should not be migrated !", id);
3655 return NULL;
3656 }
3657
c01b16ed 3658 mis->last_recv_block[channel] = block;
755e8d7c 3659
4c4bad48
HZ
3660 return block;
3661}
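/*
 * Block id encoding, as consumed above: when RAM_SAVE_FLAG_CONTINUE is set
 * no id follows and the block last received on this channel is reused;
 * otherwise the stream carries a u8 length followed by that many bytes of
 * RAMBlock idstr. (Sketch derived from ram_block_from_stream() itself.)
 */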
3662
3663static inline void *host_from_ram_block_offset(RAMBlock *block,
3664 ram_addr_t offset)
3665{
3666 if (!offset_in_ramblock(block, offset)) {
3667 return NULL;
3668 }
3669
3670 return block->host + offset;
56e93d26
JQ
3671}
3672
6a23f639
DH
3673static void *host_page_from_ram_block_offset(RAMBlock *block,
3674 ram_addr_t offset)
3675{
3676 /* Note: Explicitly no check against offset_in_ramblock(). */
3677 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3678 block->page_size);
3679}
3680
3681static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3682 ram_addr_t offset)
3683{
3684 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3685}
3686
13af18f2 3687static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3688 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3689{
3690 if (!offset_in_ramblock(block, offset)) {
3691 return NULL;
3692 }
3693 if (!block->colo_cache) {
3694 error_report("%s: colo_cache is NULL in block :%s",
3695 __func__, block->idstr);
3696 return NULL;
3697 }
7d9acafa
ZC
3698
3699 /*
3700 * During a colo checkpoint, we need a bitmap of these migrated pages.
3701 * It helps us decide which pages in the ram cache should be flushed
3702 * into VM's RAM later.
3703 */
8af66371
HZ
3704 if (record_bitmap &&
3705 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3706 ram_state->migration_dirty_pages++;
3707 }
13af18f2
ZC
3708 return block->colo_cache + offset;
3709}
3710
3d0684b2
JQ
3711/**
3712 * ram_handle_compressed: handle the zero page case
3713 *
56e93d26
JQ
3714 * If a page (or a whole RDMA chunk) has been
3715 * determined to be zero, then zap it.
3d0684b2
JQ
3716 *
3717 * @host: host address for the zero page
3718 * @ch: what the page is filled from. We only support zero
3719 * @size: size of the zero page
56e93d26
JQ
3720 */
3721void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3722{
bad452a7 3723 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3724 memset(host, ch, size);
3725 }
3726}
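/*
 * Design note on the check above: for the common all-zero case (ch == 0)
 * the memset() is skipped whenever the destination already reads as zero,
 * which avoids dirtying (and potentially allocating) pages that are still
 * untouched on the destination.
 */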
3727
797ca154
XG
3728/* return the size after decompression, or negative value on error */
3729static int
3730qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3731 const uint8_t *source, size_t source_len)
3732{
3733 int err;
3734
3735 err = inflateReset(stream);
3736 if (err != Z_OK) {
3737 return -1;
3738 }
3739
3740 stream->avail_in = source_len;
3741 stream->next_in = (uint8_t *)source;
3742 stream->avail_out = dest_len;
3743 stream->next_out = dest;
3744
3745 err = inflate(stream, Z_NO_FLUSH);
3746 if (err != Z_STREAM_END) {
3747 return -1;
3748 }
3749
3750 return stream->total_out;
3751}
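/*
 * Note on the zlib usage above: the per-thread z_stream is reused via
 * inflateReset(), and a whole page is expected to decompress in a single
 * inflate() call, so any result other than Z_STREAM_END is treated as an
 * error rather than retried.
 */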
3752
56e93d26
JQ
3753static void *do_data_decompress(void *opaque)
3754{
3755 DecompressParam *param = opaque;
3756 unsigned long pagesize;
33d151f4 3757 uint8_t *des;
34ab9e97 3758 int len, ret;
56e93d26 3759
33d151f4 3760 qemu_mutex_lock(&param->mutex);
90e56fb4 3761 while (!param->quit) {
33d151f4
LL
3762 if (param->des) {
3763 des = param->des;
3764 len = param->len;
3765 param->des = 0;
3766 qemu_mutex_unlock(&param->mutex);
3767
56e93d26 3768 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3769
3770 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3771 param->compbuf, len);
f548222c 3772 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3773 error_report("decompress data failed");
3774 qemu_file_set_error(decomp_file, ret);
3775 }
73a8912b 3776
33d151f4
LL
3777 qemu_mutex_lock(&decomp_done_lock);
3778 param->done = true;
3779 qemu_cond_signal(&decomp_done_cond);
3780 qemu_mutex_unlock(&decomp_done_lock);
3781
3782 qemu_mutex_lock(&param->mutex);
3783 } else {
3784 qemu_cond_wait(&param->cond, &param->mutex);
3785 }
56e93d26 3786 }
33d151f4 3787 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3788
3789 return NULL;
3790}
3791
34ab9e97 3792static int wait_for_decompress_done(void)
5533b2e9
LL
3793{
3794 int idx, thread_count;
3795
a7a94d14 3796 if (!migrate_compress()) {
34ab9e97 3797 return 0;
5533b2e9
LL
3798 }
3799
3800 thread_count = migrate_decompress_threads();
3801 qemu_mutex_lock(&decomp_done_lock);
3802 for (idx = 0; idx < thread_count; idx++) {
3803 while (!decomp_param[idx].done) {
3804 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3805 }
3806 }
3807 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3808 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3809}
3810
f0afa331 3811static void compress_threads_load_cleanup(void)
56e93d26
JQ
3812{
3813 int i, thread_count;
3814
a7a94d14 3815 if (!migrate_compress()) {
3416ab5b
JQ
3816 return;
3817 }
56e93d26
JQ
3818 thread_count = migrate_decompress_threads();
3819 for (i = 0; i < thread_count; i++) {
797ca154
XG
3820 /*
3821 * we use it as an indicator of whether the thread has been
3822 * properly initialized or not
3823 */
3824 if (!decomp_param[i].compbuf) {
3825 break;
3826 }
3827
56e93d26 3828 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3829 decomp_param[i].quit = true;
56e93d26
JQ
3830 qemu_cond_signal(&decomp_param[i].cond);
3831 qemu_mutex_unlock(&decomp_param[i].mutex);
3832 }
3833 for (i = 0; i < thread_count; i++) {
797ca154
XG
3834 if (!decomp_param[i].compbuf) {
3835 break;
3836 }
3837
56e93d26
JQ
3838 qemu_thread_join(decompress_threads + i);
3839 qemu_mutex_destroy(&decomp_param[i].mutex);
3840 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3841 inflateEnd(&decomp_param[i].stream);
56e93d26 3842 g_free(decomp_param[i].compbuf);
797ca154 3843 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3844 }
3845 g_free(decompress_threads);
3846 g_free(decomp_param);
56e93d26
JQ
3847 decompress_threads = NULL;
3848 decomp_param = NULL;
34ab9e97 3849 decomp_file = NULL;
56e93d26
JQ
3850}
3851
34ab9e97 3852static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3853{
3854 int i, thread_count;
3855
a7a94d14 3856 if (!migrate_compress()) {
797ca154
XG
3857 return 0;
3858 }
3859
3860 thread_count = migrate_decompress_threads();
3861 decompress_threads = g_new0(QemuThread, thread_count);
3862 decomp_param = g_new0(DecompressParam, thread_count);
3863 qemu_mutex_init(&decomp_done_lock);
3864 qemu_cond_init(&decomp_done_cond);
34ab9e97 3865 decomp_file = f;
797ca154
XG
3866 for (i = 0; i < thread_count; i++) {
3867 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3868 goto exit;
3869 }
3870
3871 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3872 qemu_mutex_init(&decomp_param[i].mutex);
3873 qemu_cond_init(&decomp_param[i].cond);
3874 decomp_param[i].done = true;
3875 decomp_param[i].quit = false;
3876 qemu_thread_create(decompress_threads + i, "decompress",
3877 do_data_decompress, decomp_param + i,
3878 QEMU_THREAD_JOINABLE);
3879 }
3880 return 0;
3881exit:
3882 compress_threads_load_cleanup();
3883 return -1;
3884}
3885
c1bc6626 3886static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3887 void *host, int len)
3888{
3889 int idx, thread_count;
3890
3891 thread_count = migrate_decompress_threads();
37396950 3892 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3893 while (true) {
3894 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3895 if (decomp_param[idx].done) {
33d151f4
LL
3896 decomp_param[idx].done = false;
3897 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3898 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3899 decomp_param[idx].des = host;
3900 decomp_param[idx].len = len;
33d151f4
LL
3901 qemu_cond_signal(&decomp_param[idx].cond);
3902 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3903 break;
3904 }
3905 }
3906 if (idx < thread_count) {
3907 break;
73a8912b
LL
3908 } else {
3909 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3910 }
3911 }
3912}
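/*
 * Hand-off scheme used above: the load thread looks for an idle worker
 * (done == true), copies the compressed payload into that worker's compbuf
 * and signals it; if every worker is busy it blocks on decomp_done_cond
 * until one finishes. The workers decompress directly into guest memory
 * and report failures via qemu_file_set_error() on decomp_file.
 */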
3913
b70cb3b4
RL
3914static void colo_init_ram_state(void)
3915{
3916 ram_state_init(&ram_state);
b70cb3b4
RL
3917}
3918
13af18f2
ZC
3919/*
3920 * colo cache: this is for the secondary VM. We cache the whole
3921 * memory of the secondary VM; the global lock must be held
3922 * when calling this helper.
3923 */
3924int colo_init_ram_cache(void)
3925{
3926 RAMBlock *block;
3927
44901b5a
PB
3928 WITH_RCU_READ_LOCK_GUARD() {
3929 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3930 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3931 NULL, false, false);
44901b5a
PB
3932 if (!block->colo_cache) {
3933 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3934 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3935 block->used_length);
3936 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3937 if (block->colo_cache) {
3938 qemu_anon_ram_free(block->colo_cache, block->used_length);
3939 block->colo_cache = NULL;
3940 }
89ac5a1d 3941 }
44901b5a 3942 return -errno;
89ac5a1d 3943 }
e5fdf920
LS
3944 if (!machine_dump_guest_core(current_machine)) {
3945 qemu_madvise(block->colo_cache, block->used_length,
3946 QEMU_MADV_DONTDUMP);
3947 }
13af18f2 3948 }
13af18f2 3949 }
44901b5a 3950
7d9acafa
ZC
3951 /*
3952 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3953 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3954 * we use the same name 'ram_bitmap' as for migration.
3955 */
3956 if (ram_bytes_total()) {
3957 RAMBlock *block;
3958
fbd162e6 3959 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3960 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3961 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3962 }
3963 }
7d9acafa 3964
b70cb3b4 3965 colo_init_ram_state();
13af18f2 3966 return 0;
13af18f2
ZC
3967}
3968
0393031a
HZ
3969/* TODO: duplicated with ram_init_bitmaps */
3970void colo_incoming_start_dirty_log(void)
3971{
3972 RAMBlock *block = NULL;
3973 /* For memory_global_dirty_log_start below. */
3974 qemu_mutex_lock_iothread();
3975 qemu_mutex_lock_ramlist();
3976
3977 memory_global_dirty_log_sync();
3978 WITH_RCU_READ_LOCK_GUARD() {
3979 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3980 ramblock_sync_dirty_bitmap(ram_state, block);
3981 /* Discard this dirty bitmap record */
3982 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3983 }
63b41db4 3984 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3985 }
3986 ram_state->migration_dirty_pages = 0;
3987 qemu_mutex_unlock_ramlist();
3988 qemu_mutex_unlock_iothread();
3989}
3990
13af18f2
ZC
3991 /* The global lock must be held to call this helper */
3992void colo_release_ram_cache(void)
3993{
3994 RAMBlock *block;
3995
63b41db4 3996 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3997 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3998 g_free(block->bmap);
3999 block->bmap = NULL;
4000 }
4001
89ac5a1d
DDAG
4002 WITH_RCU_READ_LOCK_GUARD() {
4003 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4004 if (block->colo_cache) {
4005 qemu_anon_ram_free(block->colo_cache, block->used_length);
4006 block->colo_cache = NULL;
4007 }
13af18f2
ZC
4008 }
4009 }
0393031a 4010 ram_state_cleanup(&ram_state);
13af18f2
ZC
4011}
4012
f265e0e4
JQ
4013/**
4014 * ram_load_setup: Setup RAM for migration incoming side
4015 *
4016 * Returns zero to indicate success and negative for error
4017 *
4018 * @f: QEMUFile where to receive the data
4019 * @opaque: RAMState pointer
4020 */
4021static int ram_load_setup(QEMUFile *f, void *opaque)
4022{
34ab9e97 4023 if (compress_threads_load_setup(f)) {
797ca154
XG
4024 return -1;
4025 }
4026
f265e0e4 4027 xbzrle_load_setup();
f9494614 4028 ramblock_recv_map_init();
13af18f2 4029
f265e0e4
JQ
4030 return 0;
4031}
4032
4033static int ram_load_cleanup(void *opaque)
4034{
f9494614 4035 RAMBlock *rb;
56eb90af 4036
fbd162e6 4037 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 4038 qemu_ram_block_writeback(rb);
56eb90af
JH
4039 }
4040
f265e0e4 4041 xbzrle_load_cleanup();
f0afa331 4042 compress_threads_load_cleanup();
f9494614 4043
fbd162e6 4044 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
4045 g_free(rb->receivedmap);
4046 rb->receivedmap = NULL;
4047 }
13af18f2 4048
f265e0e4
JQ
4049 return 0;
4050}
4051
3d0684b2
JQ
4052/**
4053 * ram_postcopy_incoming_init: allocate postcopy data structures
4054 *
4055 * Returns 0 for success and negative if there was an error
4056 *
4057 * @mis: current migration incoming state
4058 *
4059 * Allocate data structures etc needed by incoming migration with
4060 * postcopy-ram. postcopy-ram's similarly named
4061 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4062 */
4063int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4064{
c136180c 4065 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4066}
4067
3d0684b2
JQ
4068/**
4069 * ram_load_postcopy: load a page in postcopy case
4070 *
4071 * Returns 0 for success or -errno in case of error
4072 *
a7180877
DDAG
4073 * Called in postcopy mode by ram_load().
4074 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4075 *
4076 * @f: QEMUFile where to send the data
36f62f11 4077 * @channel: the channel to use for loading
a7180877 4078 */
36f62f11 4079int ram_load_postcopy(QEMUFile *f, int channel)
a7180877
DDAG
4080{
4081 int flags = 0, ret = 0;
4082 bool place_needed = false;
1aa83678 4083 bool matches_target_page_size = false;
a7180877 4084 MigrationIncomingState *mis = migration_incoming_get_current();
36f62f11 4085 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
a7180877
DDAG
4086
4087 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4088 ram_addr_t addr;
a7180877
DDAG
4089 void *page_buffer = NULL;
4090 void *place_source = NULL;
df9ff5e1 4091 RAMBlock *block = NULL;
a7180877 4092 uint8_t ch;
644acf99 4093 int len;
a7180877
DDAG
4094
4095 addr = qemu_get_be64(f);
7a9ddfbf
PX
4096
4097 /*
4098 * If qemu file error, we should stop here, and then "addr"
4099 * may be invalid
4100 */
4101 ret = qemu_file_get_error(f);
4102 if (ret) {
4103 break;
4104 }
4105
a7180877
DDAG
4106 flags = addr & ~TARGET_PAGE_MASK;
4107 addr &= TARGET_PAGE_MASK;
4108
36f62f11 4109 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
644acf99
WY
4110 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4111 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
c01b16ed 4112 block = ram_block_from_stream(mis, f, flags, channel);
6a23f639
DH
4113 if (!block) {
4114 ret = -EINVAL;
4115 break;
4116 }
4c4bad48 4117
898ba906
DH
4118 /*
4119 * Relying on used_length is racy and can result in false positives.
4120 * We might place pages beyond used_length in case RAM was shrunk
4121 * while in postcopy, which is fine - trying to place via
4122 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4123 */
4124 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
4125 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4126 ret = -EINVAL;
4127 break;
4128 }
77dadc3f 4129 tmp_page->target_pages++;
1aa83678 4130 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4131 /*
28abd200
DDAG
4132 * Postcopy requires that we place whole host pages atomically;
4133 * these may be huge pages for RAMBlocks that are backed by
4134 * hugetlbfs.
a7180877
DDAG
4135 * To make it atomic, the data is read into a temporary page
4136 * that's moved into place later.
4137 * The migration protocol uses, possibly smaller, target-pages
4138 * however the source ensures it always sends all the components
91ba442f 4139 * of a host page in one chunk.
a7180877 4140 */
77dadc3f 4141 page_buffer = tmp_page->tmp_huge_page +
6a23f639
DH
4142 host_page_offset_from_ram_block_offset(block, addr);
4143 /* If all TP are zero then we can optimise the place */
77dadc3f
PX
4144 if (tmp_page->target_pages == 1) {
4145 tmp_page->host_addr =
4146 host_page_from_ram_block_offset(block, addr);
4147 } else if (tmp_page->host_addr !=
4148 host_page_from_ram_block_offset(block, addr)) {
c53b7ddc 4149 /* not the 1st TP within the HP */
36f62f11 4150 error_report("Non-same host page detected on channel %d: "
cfc7dc8a
PX
4151 "Target host page %p, received host page %p "
4152 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
36f62f11 4153 channel, tmp_page->host_addr,
cfc7dc8a
PX
4154 host_page_from_ram_block_offset(block, addr),
4155 block->idstr, addr, tmp_page->target_pages);
6a23f639
DH
4156 ret = -EINVAL;
4157 break;
a7180877
DDAG
4158 }
4159
4160 /*
4161 * If it's the last part of a host page then we place the host
4162 * page
4163 */
77dadc3f
PX
4164 if (tmp_page->target_pages ==
4165 (block->page_size / TARGET_PAGE_SIZE)) {
4cbb3c63 4166 place_needed = true;
4cbb3c63 4167 }
77dadc3f 4168 place_source = tmp_page->tmp_huge_page;
a7180877
DDAG
4169 }
4170
4171 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4172 case RAM_SAVE_FLAG_ZERO:
a7180877 4173 ch = qemu_get_byte(f);
2e36bc1b
WY
4174 /*
4175 * We can skip setting page_buffer when
4176 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4177 */
4178 if (ch || !matches_target_page_size) {
4179 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4180 }
a7180877 4181 if (ch) {
77dadc3f 4182 tmp_page->all_zero = false;
a7180877
DDAG
4183 }
4184 break;
4185
4186 case RAM_SAVE_FLAG_PAGE:
77dadc3f 4187 tmp_page->all_zero = false;
1aa83678
PX
4188 if (!matches_target_page_size) {
4189 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4190 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4191 } else {
1aa83678
PX
4192 /*
4193 * For small pages that match the target page size, we
4194 * avoid the qemu_file copy. Instead we directly use
4195 * the buffer of QEMUFile to place the page. Note: we
4196 * cannot do any QEMUFile operation before using that
4197 * buffer to make sure the buffer is valid when
4198 * placing the page.
a7180877
DDAG
4199 */
4200 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4201 TARGET_PAGE_SIZE);
4202 }
4203 break;
644acf99 4204 case RAM_SAVE_FLAG_COMPRESS_PAGE:
77dadc3f 4205 tmp_page->all_zero = false;
644acf99
WY
4206 len = qemu_get_be32(f);
4207 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4208 error_report("Invalid compressed data length: %d", len);
4209 ret = -EINVAL;
4210 break;
4211 }
4212 decompress_data_with_multi_threads(f, page_buffer, len);
4213 break;
294e5a40
JQ
4214 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4215 multifd_recv_sync_main();
4216 break;
a7180877
DDAG
4217 case RAM_SAVE_FLAG_EOS:
4218 /* normal exit */
b05292c2
JQ
4219 if (migrate_multifd_flush_after_each_section()) {
4220 multifd_recv_sync_main();
4221 }
a7180877
DDAG
4222 break;
4223 default:
29fccade 4224 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
4225 " (postcopy mode)", flags);
4226 ret = -EINVAL;
7a9ddfbf
PX
4227 break;
4228 }
4229
644acf99
WY
4230 /* Got the whole host page, wait for decompress before placing. */
4231 if (place_needed) {
4232 ret |= wait_for_decompress_done();
4233 }
4234
7a9ddfbf
PX
4235 /* Detect any possible file errors */
4236 if (!ret && qemu_file_get_error(f)) {
4237 ret = qemu_file_get_error(f);
a7180877
DDAG
4238 }
4239
7a9ddfbf 4240 if (!ret && place_needed) {
77dadc3f
PX
4241 if (tmp_page->all_zero) {
4242 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
a7180877 4243 } else {
77dadc3f
PX
4244 ret = postcopy_place_page(mis, tmp_page->host_addr,
4245 place_source, block);
a7180877 4246 }
ddf35bdf 4247 place_needed = false;
77dadc3f 4248 postcopy_temp_page_reset(tmp_page);
a7180877 4249 }
a7180877
DDAG
4250 }
4251
4252 return ret;
4253}
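/*
 * Summary of the placement rule implemented above: target pages belonging
 * to one host page are accumulated in the per-channel temporary page and
 * only handed to postcopy_place_page()/postcopy_place_page_zero() once all
 * block->page_size / TARGET_PAGE_SIZE components have arrived, so each host
 * page (possibly a huge page) becomes visible to the guest atomically.
 */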
4254
acab30b8
DHB
4255static bool postcopy_is_running(void)
4256{
4257 PostcopyState ps = postcopy_state_get();
4258 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4259}
4260
e6f4aa18
ZC
4261/*
4262 * Flush content of RAM cache into SVM's memory.
4263 * Only flush the pages that be dirtied by PVM or SVM or both.
4264 */
24fa16f8 4265void colo_flush_ram_cache(void)
e6f4aa18
ZC
4266{
4267 RAMBlock *block = NULL;
4268 void *dst_host;
4269 void *src_host;
4270 unsigned long offset = 0;
4271
d1955d22 4272 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4273 WITH_RCU_READ_LOCK_GUARD() {
4274 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4275 ramblock_sync_dirty_bitmap(ram_state, block);
4276 }
d1955d22 4277 }
d1955d22 4278
e6f4aa18 4279 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4280 WITH_RCU_READ_LOCK_GUARD() {
4281 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4282
89ac5a1d 4283 while (block) {
a6a83cef 4284 unsigned long num = 0;
e6f4aa18 4285
a6a83cef 4286 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
4287 if (!offset_in_ramblock(block,
4288 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 4289 offset = 0;
a6a83cef 4290 num = 0;
89ac5a1d
DDAG
4291 block = QLIST_NEXT_RCU(block, next);
4292 } else {
a6a83cef
RL
4293 unsigned long i = 0;
4294
4295 for (i = 0; i < num; i++) {
4296 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4297 }
8bba004c
AR
4298 dst_host = block->host
4299 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4300 src_host = block->colo_cache
4301 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
4302 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4303 offset += num;
89ac5a1d 4304 }
e6f4aa18
ZC
4305 }
4306 }
e6f4aa18
ZC
4307 trace_colo_flush_ram_cache_end();
4308}
4309
10da4a36
WY
4310/**
4311 * ram_load_precopy: load pages in precopy case
4312 *
4313 * Returns 0 for success or -errno in case of error
4314 *
4315 * Called in precopy mode by ram_load().
4316 * rcu_read_lock is taken prior to this being called.
4317 *
4318 * @f: QEMUFile where to send the data
4319 */
4320static int ram_load_precopy(QEMUFile *f)
56e93d26 4321{
755e8d7c 4322 MigrationIncomingState *mis = migration_incoming_get_current();
e65cec5e 4323 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4324 /* ADVISE is earlier, it shows the source has the postcopy capability on */
80fe315c 4325 bool postcopy_advised = migration_incoming_postcopy_advised();
a7a94d14 4326 if (!migrate_compress()) {
edc60127
JQ
4327 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4328 }
a7180877 4329
10da4a36 4330 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4331 ram_addr_t addr, total_ram_bytes;
0393031a 4332 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
4333 uint8_t ch;
4334
e65cec5e
YK
4335 /*
4336 * Yield periodically to let main loop run, but an iteration of
4337 * the main loop is expensive, so only do it every so many iterations
4338 */
4339 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4340 aio_co_schedule(qemu_get_current_aio_context(),
4341 qemu_coroutine_self());
4342 qemu_coroutine_yield();
4343 }
4344 i++;
4345
56e93d26
JQ
4346 addr = qemu_get_be64(f);
4347 flags = addr & ~TARGET_PAGE_MASK;
4348 addr &= TARGET_PAGE_MASK;
4349
edc60127
JQ
4350 if (flags & invalid_flags) {
4351 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4352 error_report("Received an unexpected compressed page");
4353 }
4354
4355 ret = -EINVAL;
4356 break;
4357 }
4358
bb890ed5 4359 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4360 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
c01b16ed
PX
4361 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4362 RAM_CHANNEL_PRECOPY);
4c4bad48 4363
0393031a 4364 host = host_from_ram_block_offset(block, addr);
13af18f2 4365 /*
0393031a
HZ
4366 * After going into the COLO stage, we should not load the page
4367 * into the SVM's memory directly; we put it into colo_cache first.
4368 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4369 * Previously, we copied all this memory in the preparing stage of COLO
4370 * while the VM had to be stopped, which is a time-consuming process.
4371 * Here we optimize it with a trick: back up every page during the
4372 * migration process while COLO is enabled. Although this affects the
4373 * migration speed, it clearly reduces the downtime compared to backing
4374 * up all of the SVM's memory in the COLO preparing stage.
13af18f2 4375 */
0393031a
HZ
4376 if (migration_incoming_colo_enabled()) {
4377 if (migration_incoming_in_colo_state()) {
4378 /* In COLO stage, put all pages into cache temporarily */
8af66371 4379 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
4380 } else {
4381 /*
4382 * In migration stage but before COLO stage,
4383 * Put all pages into both cache and SVM's memory.
4384 */
8af66371 4385 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 4386 }
13af18f2 4387 }
a776aa15
DDAG
4388 if (!host) {
4389 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4390 ret = -EINVAL;
4391 break;
4392 }
13af18f2
ZC
4393 if (!migration_incoming_in_colo_state()) {
4394 ramblock_recv_bitmap_set(block, host);
4395 }
4396
1db9d8e5 4397 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4398 }
4399
56e93d26
JQ
4400 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4401 case RAM_SAVE_FLAG_MEM_SIZE:
4402 /* Synchronize RAM block list */
4403 total_ram_bytes = addr;
4404 while (!ret && total_ram_bytes) {
4405 RAMBlock *block;
56e93d26
JQ
4406 char id[256];
4407 ram_addr_t length;
4408
4409 len = qemu_get_byte(f);
4410 qemu_get_buffer(f, (uint8_t *)id, len);
4411 id[len] = 0;
4412 length = qemu_get_be64(f);
4413
e3dd7493 4414 block = qemu_ram_block_by_name(id);
b895de50
CLG
4415 if (block && !qemu_ram_is_migratable(block)) {
4416 error_report("block %s should not be migrated !", id);
4417 ret = -EINVAL;
4418 } else if (block) {
e3dd7493
DDAG
4419 if (length != block->used_length) {
4420 Error *local_err = NULL;
56e93d26 4421
fa53a0e5 4422 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4423 &local_err);
4424 if (local_err) {
4425 error_report_err(local_err);
56e93d26 4426 }
56e93d26 4427 }
ef08fb38 4428 /* For postcopy we need to check hugepage sizes match */
e846b746 4429 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
4430 block->page_size != qemu_host_page_size) {
4431 uint64_t remote_page_size = qemu_get_be64(f);
4432 if (remote_page_size != block->page_size) {
4433 error_report("Mismatched RAM page size %s "
4434 "(local) %zd != %" PRId64,
4435 id, block->page_size,
4436 remote_page_size);
4437 ret = -EINVAL;
4438 }
4439 }
fbd162e6
YK
4440 if (migrate_ignore_shared()) {
4441 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4442 if (ramblock_is_ignored(block) &&
4443 block->mr->addr != addr) {
4444 error_report("Mismatched GPAs for block %s "
4445 "%" PRId64 "!= %" PRId64,
4446 id, (uint64_t)addr,
4447 (uint64_t)block->mr->addr);
4448 ret = -EINVAL;
4449 }
4450 }
e3dd7493
DDAG
4451 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4452 block->idstr);
4453 } else {
56e93d26
JQ
4454 error_report("Unknown ramblock \"%s\", cannot "
4455 "accept migration", id);
4456 ret = -EINVAL;
4457 }
4458
4459 total_ram_bytes -= length;
4460 }
4461 break;
a776aa15 4462
bb890ed5 4463 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4464 ch = qemu_get_byte(f);
4465 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4466 break;
a776aa15 4467
56e93d26 4468 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4469 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4470 break;
56e93d26 4471
a776aa15 4472 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4473 len = qemu_get_be32(f);
4474 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4475 error_report("Invalid compressed data length: %d", len);
4476 ret = -EINVAL;
4477 break;
4478 }
c1bc6626 4479 decompress_data_with_multi_threads(f, host, len);
56e93d26 4480 break;
a776aa15 4481
56e93d26 4482 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4483 if (load_xbzrle(f, addr, host) < 0) {
4484 error_report("Failed to decompress XBZRLE page at "
4485 RAM_ADDR_FMT, addr);
4486 ret = -EINVAL;
4487 break;
4488 }
4489 break;
294e5a40
JQ
4490 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4491 multifd_recv_sync_main();
4492 break;
56e93d26
JQ
4493 case RAM_SAVE_FLAG_EOS:
4494 /* normal exit */
b05292c2
JQ
4495 if (migrate_multifd_flush_after_each_section()) {
4496 multifd_recv_sync_main();
4497 }
56e93d26 4498 break;
5f1e7540
JQ
4499 case RAM_SAVE_FLAG_HOOK:
4500 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4501 break;
56e93d26 4502 default:
5f1e7540
JQ
4503 error_report("Unknown combination of migration flags: 0x%x", flags);
4504 ret = -EINVAL;
56e93d26
JQ
4505 }
4506 if (!ret) {
4507 ret = qemu_file_get_error(f);
4508 }
0393031a
HZ
4509 if (!ret && host_bak) {
4510 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4511 }
56e93d26
JQ
4512 }
4513
ca1a6b70 4514 ret |= wait_for_decompress_done();
10da4a36
WY
4515 return ret;
4516}
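/*
 * For reference, each record consumed by the precopy loop above starts with
 * a be64 whose low bits carry the RAM_SAVE_FLAG_* value and whose
 * page-aligned part is normally the offset within the current RAMBlock
 * (for MEM_SIZE it carries the total RAM size instead). The payload then
 * depends on the flag, as handled above: ZERO carries a single fill byte,
 * PAGE a raw TARGET_PAGE_SIZE page, COMPRESS_PAGE a be32 length plus that
 * many compressed bytes, XBZRLE the header parsed in load_xbzrle(), and
 * MEM_SIZE the block list written by ram_save_setup().
 */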
4517
4518static int ram_load(QEMUFile *f, void *opaque, int version_id)
4519{
4520 int ret = 0;
4521 static uint64_t seq_iter;
4522 /*
4523 * If system is running in postcopy mode, page inserts to host memory must
4524 * be atomic
4525 */
4526 bool postcopy_running = postcopy_is_running();
4527
4528 seq_iter++;
4529
4530 if (version_id != 4) {
4531 return -EINVAL;
4532 }
4533
4534 /*
4535 * This RCU critical section can be very long running.
4536 * When RCU reclaims in the code start to become numerous,
4537 * it will be necessary to reduce the granularity of this
4538 * critical section.
4539 */
89ac5a1d
DDAG
4540 WITH_RCU_READ_LOCK_GUARD() {
4541 if (postcopy_running) {
36f62f11
PX
4542 /*
4543 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4544 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4545 * service fast page faults.
4546 */
4547 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
89ac5a1d
DDAG
4548 } else {
4549 ret = ram_load_precopy(f);
4550 }
10da4a36 4551 }
55c4446b 4552 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4553
56e93d26
JQ
4554 return ret;
4555}
4556
c6467627
VSO
4557static bool ram_has_postcopy(void *opaque)
4558{
469dd51b 4559 RAMBlock *rb;
fbd162e6 4560 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4561 if (ramblock_is_pmem(rb)) {
4562 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4563 "is not supported now!", rb->idstr, rb->host);
4564 return false;
4565 }
4566 }
4567
c6467627
VSO
4568 return migrate_postcopy_ram();
4569}
4570
edd090c7
PX
4571/* Sync all the dirty bitmap with destination VM. */
4572static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4573{
4574 RAMBlock *block;
4575 QEMUFile *file = s->to_dst_file;
4576 int ramblock_count = 0;
4577
4578 trace_ram_dirty_bitmap_sync_start();
4579
fbd162e6 4580 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4581 qemu_savevm_send_recv_bitmap(file, block->idstr);
4582 trace_ram_dirty_bitmap_request(block->idstr);
4583 ramblock_count++;
4584 }
4585
4586 trace_ram_dirty_bitmap_sync_wait();
4587
4588 /* Wait until all the ramblocks' dirty bitmap synced */
4589 while (ramblock_count--) {
4590 qemu_sem_wait(&s->rp_state.rp_sem);
4591 }
4592
4593 trace_ram_dirty_bitmap_sync_complete();
4594
4595 return 0;
4596}
4597
4598static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4599{
4600 qemu_sem_post(&s->rp_state.rp_sem);
4601}
4602
a335debb
PX
4603/*
4604 * Read the received bitmap, revert it as the initial dirty bitmap.
4605 * This is only used when the postcopy migration is paused but wants
4606 * to resume from a middle point.
4607 */
4608int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4609{
4610 int ret = -EINVAL;
43044ac0 4611 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4612 QEMUFile *file = s->rp_state.from_dst_file;
4613 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4614 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4615 uint64_t size, end_mark;
4616
4617 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4618
4619 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4620 error_report("%s: incorrect state %s", __func__,
4621 MigrationStatus_str(s->state));
4622 return -EINVAL;
4623 }
4624
4625 /*
4626 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4627 * need the endianness conversion, and the paddings.
a335debb
PX
4628 */
4629 local_size = ROUND_UP(local_size, 8);
4630
4631 /* Add paddings */
4632 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4633
4634 size = qemu_get_be64(file);
4635
4636 /* The size of the bitmap should match our ramblock */
4637 if (size != local_size) {
4638 error_report("%s: ramblock '%s' bitmap size mismatch "
4639 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4640 block->idstr, size, local_size);
4641 ret = -EINVAL;
4642 goto out;
4643 }
4644
4645 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4646 end_mark = qemu_get_be64(file);
4647
4648 ret = qemu_file_get_error(file);
4649 if (ret || size != local_size) {
4650 error_report("%s: read bitmap failed for ramblock '%s': %d"
4651 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4652 __func__, block->idstr, ret, local_size, size);
4653 ret = -EIO;
4654 goto out;
4655 }
4656
4657 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4658 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4659 __func__, block->idstr, end_mark);
4660 ret = -EINVAL;
4661 goto out;
4662 }
4663
4664 /*
3a4452d8 4665 * Endianness conversion. We are in postcopy (though paused).
a335debb
PX
4666 * The dirty bitmap won't change. We can directly modify it.
4667 */
4668 bitmap_from_le(block->bmap, le_bitmap, nbits);
4669
4670 /*
4671 * What we received is "received bitmap". Revert it as the initial
4672 * dirty bitmap for this ramblock.
4673 */
4674 bitmap_complement(block->bmap, block->bmap, nbits);
4675
be39b4cd
DH
4676 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4677 ramblock_dirty_bitmap_clear_discarded_pages(block);
4678
4679 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4680 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4681
edd090c7
PX
4682 /*
4683 * We succeeded in syncing the bitmap for the current ramblock. If this is
4684 * the last one to sync, we need to notify the main send thread.
4685 */
4686 ram_dirty_bitmap_reload_notify(s);
4687
a335debb
PX
4688 ret = 0;
4689out:
bf269906 4690 g_free(le_bitmap);
a335debb
PX
4691 return ret;
4692}
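/*
 * The received-bitmap payload parsed above is, in order: a be64 size that
 * must match DIV_ROUND_UP(nbits, 8) rounded up to 8 bytes, the little-endian
 * bitmap itself, and a be64 end mark equal to RAMBLOCK_RECV_BITMAP_ENDING
 * (see ramblock_recv_bitmap_send() on the destination side).
 */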
4693
edd090c7
PX
4694static int ram_resume_prepare(MigrationState *s, void *opaque)
4695{
4696 RAMState *rs = *(RAMState **)opaque;
08614f34 4697 int ret;
edd090c7 4698
08614f34
PX
4699 ret = ram_dirty_bitmap_sync_all(s, rs);
4700 if (ret) {
4701 return ret;
4702 }
4703
4704 ram_state_resume_prepare(rs, s->to_dst_file);
4705
4706 return 0;
edd090c7
PX
4707}
4708
36f62f11
PX
4709void postcopy_preempt_shutdown_file(MigrationState *s)
4710{
4711 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4712 qemu_fflush(s->postcopy_qemufile_src);
4713}
4714
56e93d26 4715static SaveVMHandlers savevm_ram_handlers = {
9907e842 4716 .save_setup = ram_save_setup,
56e93d26 4717 .save_live_iterate = ram_save_iterate,
763c906b 4718 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4719 .save_live_complete_precopy = ram_save_complete,
c6467627 4720 .has_postcopy = ram_has_postcopy,
c8df4a7a
JQ
4721 .state_pending_exact = ram_state_pending_exact,
4722 .state_pending_estimate = ram_state_pending_estimate,
56e93d26 4723 .load_state = ram_load,
f265e0e4
JQ
4724 .save_cleanup = ram_save_cleanup,
4725 .load_setup = ram_load_setup,
4726 .load_cleanup = ram_load_cleanup,
edd090c7 4727 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4728};
4729
c7c0e724
DH
4730static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4731 size_t old_size, size_t new_size)
4732{
cc61c703 4733 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4734 ram_addr_t offset;
4735 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4736 Error *err = NULL;
4737
4738 if (ramblock_is_ignored(rb)) {
4739 return;
4740 }
4741
4742 if (!migration_is_idle()) {
4743 /*
4744 * Precopy code on the source cannot deal with the size of RAM blocks
4745 * changing at random points in time - especially after sending the
4746 * RAM block sizes in the migration stream, they must no longer change.
4747 * Abort and indicate a proper reason.
4748 */
4749 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4750 migration_cancel(err);
c7c0e724 4751 error_free(err);
c7c0e724 4752 }
cc61c703
DH
4753
4754 switch (ps) {
4755 case POSTCOPY_INCOMING_ADVISE:
4756 /*
4757 * Update what ram_postcopy_incoming_init()->init_range() does at the
4758 * time postcopy was advised. Syncing RAM blocks with the source will
4759 * result in RAM resizes.
4760 */
4761 if (old_size < new_size) {
4762 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4763 error_report("RAM block '%s' discard of resized RAM failed",
4764 rb->idstr);
4765 }
4766 }
898ba906 4767 rb->postcopy_length = new_size;
cc61c703
DH
4768 break;
4769 case POSTCOPY_INCOMING_NONE:
4770 case POSTCOPY_INCOMING_RUNNING:
4771 case POSTCOPY_INCOMING_END:
4772 /*
4773 * Once our guest is running, postcopy no longer cares about
4774 * resizes. When growing, the new memory was not available on the
4775 * source, no handler needed.
4776 */
4777 break;
4778 default:
4779 error_report("RAM block '%s' resized during postcopy state: %d",
4780 rb->idstr, ps);
4781 exit(-1);
4782 }
c7c0e724
DH
4783}
4784
4785static RAMBlockNotifier ram_mig_ram_notifier = {
4786 .ram_block_resized = ram_mig_ram_block_resized,
4787};
4788
56e93d26
JQ
4789void ram_mig_init(void)
4790{
4791 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4792 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4793 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4794}