/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "sysemu/runstate.h"

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

121/**
122 * xbzrle_cache_resize: resize the xbzrle cache
123 *
124 * This function is called from qmp_migrate_set_cache_size in main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock().
128 *
c9dede2d 129 * Returns 0 for success or -1 for error
3d0684b2
JQ
130 *
131 * @new_size: new cache size
8acabf69 132 * @errp: set *errp if the check failed, with reason
56e93d26 133 */
c9dede2d 134int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
135{
136 PageCache *new_cache;
c9dede2d 137 int64_t ret = 0;
56e93d26 138
8acabf69
JQ
139 /* Check for truncation */
140 if (new_size != (size_t)new_size) {
141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142 "exceeding address space");
143 return -1;
144 }
145
2a313e5c
JQ
146 if (new_size == migrate_xbzrle_cache_size()) {
147 /* nothing to do */
c9dede2d 148 return 0;
2a313e5c
JQ
149 }
150
56e93d26
JQ
151 XBZRLE_cache_lock();
152
153 if (XBZRLE.cache != NULL) {
80f8dfde 154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 155 if (!new_cache) {
56e93d26
JQ
156 ret = -1;
157 goto out;
158 }
159
160 cache_fini(XBZRLE.cache);
161 XBZRLE.cache = new_cache;
162 }
56e93d26
JQ
163out:
164 XBZRLE_cache_unlock();
165 return ret;
166}
167
3ded54b1 168bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
169{
170 return !qemu_ram_is_migratable(block) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172}
173
343f632c
DDAG
174#undef RAMBLOCK_FOREACH
175
fbd162e6
YK
176int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177{
178 RAMBlock *block;
179 int ret = 0;
180
89ac5a1d
DDAG
181 RCU_READ_LOCK_GUARD();
182
fbd162e6
YK
183 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184 ret = func(block, opaque);
185 if (ret) {
186 break;
187 }
188 }
fbd162e6
YK
189 return ret;
190}
191
f9494614
AP
192static void ramblock_recv_map_init(void)
193{
194 RAMBlock *rb;
195
fbd162e6 196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
197 assert(!rb->receivedmap);
198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
199 }
200}
201
202int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
203{
204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
205 rb->receivedmap);
206}
207
1cba9f6e
DDAG
208bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
209{
210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211}
212
f9494614
AP
213void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
214{
215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216}
217
218void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
219 size_t nr)
220{
221 bitmap_set_atomic(rb->receivedmap,
222 ramblock_recv_bitmap_offset(host_addr, rb),
223 nr);
224}
225
a335debb
PX
226#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227
228/*
229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
230 *
231 * Returns >0 if success with sent bytes, or <0 if error.
232 */
233int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234 const char *block_name)
235{
236 RAMBlock *block = qemu_ram_block_by_name(block_name);
237 unsigned long *le_bitmap, nbits;
238 uint64_t size;
239
240 if (!block) {
241 error_report("%s: invalid block name: %s", __func__, block_name);
242 return -1;
243 }
244
245 nbits = block->used_length >> TARGET_PAGE_BITS;
246
247 /*
248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249 * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
251 */
252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253
    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
259 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
260
261 /* Size of the bitmap, in bytes */
a725ef9f 262 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
263
264 /*
265 * size is always aligned to 8 bytes for 64bit machines, but it
266 * may not be true for 32bit machines. We need this padding to
267 * make sure the migration can survive even between 32bit and
268 * 64bit machines.
269 */
270 size = ROUND_UP(size, 8);
271
272 qemu_put_be64(file, size);
273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
274 /*
275 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 276 * some "mysterious" reason.
a335debb
PX
277 */
278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
279 qemu_fflush(file);
280
bf269906 281 g_free(le_bitmap);
a335debb
PX
282
283 if (qemu_file_get_error(file)) {
284 return qemu_file_get_error(file);
285 }
286
287 return size + sizeof(size);
288}
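
/*
 * Worked example: with 4 KiB target pages, a 1 GiB RAMBlock has
 * nbits = 262144, so the bitmap takes DIV_ROUND_UP(262144, 8) = 32768
 * bytes, which is already a multiple of 8 and needs no padding. The
 * stream then carries the 8-byte size field, 32768 bitmap bytes and the
 * 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker; the function reports
 * 32768 + 8 bytes sent (the ending marker is not counted).
 */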
289
ec481c6c
JQ
290/*
291 * An outstanding page request, on the source, having been received
292 * and queued
293 */
294struct RAMSrcPageRequest {
295 RAMBlock *rb;
296 hwaddr offset;
297 hwaddr len;
298
299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300};
301
6f37bb8b
JQ
302/* State of RAM for migration */
303struct RAMState {
204b88b8
JQ
304 /* QEMUFile used for this migration */
305 QEMUFile *f;
278e2f55
AG
306 /* UFFD file descriptor, used in 'write-tracking' migration */
307 int uffdio_fd;
6f37bb8b
JQ
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
269ace29
JQ
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
6f37bb8b
JQ
314 /* last ram version we have seen */
315 uint32_t last_version;
316 /* We are in the first round */
317 bool ram_bulk_stage;
6eeb63f7
WW
318 /* The free page optimization is enabled */
319 bool fpo_enabled;
8d820d6f
JQ
320 /* How many times we have dirty too many pages */
321 int dirty_rate_high_cnt;
f664da80
JQ
322 /* these variables are used for bitmap sync */
323 /* last time we did a full bitmap_sync */
324 int64_t time_last_bitmap_sync;
eac74159 325 /* bytes transferred at start_time */
c4bdf0cf 326 uint64_t bytes_xfer_prev;
a66cd90c 327 /* number of dirty pages since start_time */
68908ed6 328 uint64_t num_dirty_pages_period;
b5833fde
JQ
329 /* xbzrle misses since the beginning of the period */
330 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
331 /* Amount of xbzrle pages since the beginning of the period */
332 uint64_t xbzrle_pages_prev;
333 /* Amount of xbzrle encoded bytes since the beginning of the period */
334 uint64_t xbzrle_bytes_prev;
76e03000
XG
335
336 /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;
343
be8b02ed
XG
344 /* total handled target pages at the beginning of period */
345 uint64_t target_page_count_prev;
346 /* total handled target pages since start */
347 uint64_t target_page_count;
9360447d 348 /* number of dirty bits in the bitmap */
2dfaf12e 349 uint64_t migration_dirty_pages;
386a907b 350 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 351 QemuMutex bitmap_mutex;
68a098f3
JQ
352 /* The RAMBlock used in the last src_page_requests */
353 RAMBlock *last_req_rb;
ec481c6c
JQ
354 /* Queue of outstanding page requests from the destination */
355 QemuMutex src_page_req_mutex;
b58deb34 356 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
357};
358typedef struct RAMState RAMState;
359
53518d94 360static RAMState *ram_state;
6f37bb8b 361
bd227060
WW
362static NotifierWithReturnList precopy_notifier_list;
363
364void precopy_infrastructure_init(void)
365{
366 notifier_with_return_list_init(&precopy_notifier_list);
367}
368
369void precopy_add_notifier(NotifierWithReturn *n)
370{
371 notifier_with_return_list_add(&precopy_notifier_list, n);
372}
373
374void precopy_remove_notifier(NotifierWithReturn *n)
375{
376 notifier_with_return_remove(n);
377}
378
379int precopy_notify(PrecopyNotifyReason reason, Error **errp)
380{
381 PrecopyNotifyData pnd;
382 pnd.reason = reason;
383 pnd.errp = errp;
384
385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
386}
387
6eeb63f7
WW
388void precopy_enable_free_page_optimization(void)
389{
390 if (!ram_state) {
391 return;
392 }
393
394 ram_state->fpo_enabled = true;
395}
396
9edabd4d 397uint64_t ram_bytes_remaining(void)
2f4fde93 398{
bae416e5
DDAG
399 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
400 0;
2f4fde93
JQ
401}
402
9360447d 403MigrationStats ram_counters;
96506894 404
b8fb8cb7
DDAG
405/* used by the search for pages to send */
406struct PageSearchStatus {
407 /* Current block being searched */
408 RAMBlock *block;
a935e30f
JQ
409 /* Current page to search from */
410 unsigned long page;
b8fb8cb7
DDAG
411 /* Set once we wrap around */
412 bool complete_round;
413};
414typedef struct PageSearchStatus PageSearchStatus;
415
76e03000
XG
416CompressionStats compression_counters;
417
56e93d26 418struct CompressParam {
56e93d26 419 bool done;
90e56fb4 420 bool quit;
5e5fdcff 421 bool zero_page;
56e93d26
JQ
422 QEMUFile *file;
423 QemuMutex mutex;
424 QemuCond cond;
425 RAMBlock *block;
426 ram_addr_t offset;
34ab9e97
XG
427
428 /* internally used fields */
dcaf446e 429 z_stream stream;
34ab9e97 430 uint8_t *originbuf;
56e93d26
JQ
431};
432typedef struct CompressParam CompressParam;
433
434struct DecompressParam {
73a8912b 435 bool done;
90e56fb4 436 bool quit;
56e93d26
JQ
437 QemuMutex mutex;
438 QemuCond cond;
439 void *des;
d341d9f3 440 uint8_t *compbuf;
56e93d26 441 int len;
797ca154 442 z_stream stream;
56e93d26
JQ
443};
444typedef struct DecompressParam DecompressParam;
445
446static CompressParam *comp_param;
447static QemuThread *compress_threads;
448/* comp_done_cond is used to wake up the migration thread when
449 * one of the compression threads has finished the compression.
 * comp_done_lock is used in conjunction with comp_done_cond.
451 */
0d9f9a5c
LL
452static QemuMutex comp_done_lock;
453static QemuCond comp_done_cond;
56e93d26
JQ
454/* The empty QEMUFileOps will be used by file in CompressParam */
455static const QEMUFileOps empty_ops = { };
456
34ab9e97 457static QEMUFile *decomp_file;
56e93d26
JQ
458static DecompressParam *decomp_param;
459static QemuThread *decompress_threads;
73a8912b
LL
460static QemuMutex decomp_done_lock;
461static QemuCond decomp_done_cond;
56e93d26 462
5e5fdcff 463static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 464 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
465
466static void *do_data_compress(void *opaque)
467{
468 CompressParam *param = opaque;
a7a9a88f
LL
469 RAMBlock *block;
470 ram_addr_t offset;
5e5fdcff 471 bool zero_page;
56e93d26 472
a7a9a88f 473 qemu_mutex_lock(&param->mutex);
90e56fb4 474 while (!param->quit) {
a7a9a88f
LL
475 if (param->block) {
476 block = param->block;
477 offset = param->offset;
478 param->block = NULL;
479 qemu_mutex_unlock(&param->mutex);
480
5e5fdcff
XG
481 zero_page = do_compress_ram_page(param->file, &param->stream,
482 block, offset, param->originbuf);
a7a9a88f 483
0d9f9a5c 484 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 485 param->done = true;
5e5fdcff 486 param->zero_page = zero_page;
0d9f9a5c
LL
487 qemu_cond_signal(&comp_done_cond);
488 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
489
490 qemu_mutex_lock(&param->mutex);
491 } else {
56e93d26
JQ
492 qemu_cond_wait(&param->cond, &param->mutex);
493 }
56e93d26 494 }
a7a9a88f 495 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
496
497 return NULL;
498}
499
f0afa331 500static void compress_threads_save_cleanup(void)
56e93d26
JQ
501{
502 int i, thread_count;
503
05306935 504 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
505 return;
506 }
05306935 507
56e93d26
JQ
508 thread_count = migrate_compress_threads();
509 for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
514 if (!comp_param[i].file) {
515 break;
516 }
05306935
FL
517
518 qemu_mutex_lock(&comp_param[i].mutex);
519 comp_param[i].quit = true;
520 qemu_cond_signal(&comp_param[i].cond);
521 qemu_mutex_unlock(&comp_param[i].mutex);
522
56e93d26 523 qemu_thread_join(compress_threads + i);
56e93d26
JQ
524 qemu_mutex_destroy(&comp_param[i].mutex);
525 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 526 deflateEnd(&comp_param[i].stream);
34ab9e97 527 g_free(comp_param[i].originbuf);
dcaf446e
XG
528 qemu_fclose(comp_param[i].file);
529 comp_param[i].file = NULL;
56e93d26 530 }
0d9f9a5c
LL
531 qemu_mutex_destroy(&comp_done_lock);
532 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
533 g_free(compress_threads);
534 g_free(comp_param);
56e93d26
JQ
535 compress_threads = NULL;
536 comp_param = NULL;
56e93d26
JQ
537}
538
dcaf446e 539static int compress_threads_save_setup(void)
56e93d26
JQ
540{
541 int i, thread_count;
542
543 if (!migrate_use_compression()) {
dcaf446e 544 return 0;
56e93d26 545 }
56e93d26
JQ
546 thread_count = migrate_compress_threads();
547 compress_threads = g_new0(QemuThread, thread_count);
548 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
549 qemu_cond_init(&comp_done_cond);
550 qemu_mutex_init(&comp_done_lock);
56e93d26 551 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
552 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
553 if (!comp_param[i].originbuf) {
554 goto exit;
555 }
556
dcaf446e
XG
557 if (deflateInit(&comp_param[i].stream,
558 migrate_compress_level()) != Z_OK) {
34ab9e97 559 g_free(comp_param[i].originbuf);
dcaf446e
XG
560 goto exit;
561 }
562
e110aa91
C
563 /* comp_param[i].file is just used as a dummy buffer to save data,
564 * set its ops to empty.
56e93d26
JQ
565 */
566 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
567 comp_param[i].done = true;
90e56fb4 568 comp_param[i].quit = false;
56e93d26
JQ
569 qemu_mutex_init(&comp_param[i].mutex);
570 qemu_cond_init(&comp_param[i].cond);
571 qemu_thread_create(compress_threads + i, "compress",
572 do_data_compress, comp_param + i,
573 QEMU_THREAD_JOINABLE);
574 }
dcaf446e
XG
575 return 0;
576
577exit:
578 compress_threads_save_cleanup();
579 return -1;
56e93d26
JQ
580}
581
582/**
3d0684b2 583 * save_page_header: write page header to wire
56e93d26
JQ
584 *
585 * If this is the 1st block, it also writes the block identification
586 *
3d0684b2 587 * Returns the number of bytes written
56e93d26
JQ
588 *
589 * @f: QEMUFile where to send the data
590 * @block: block that contains the page we want to send
591 * @offset: offset inside the block for the page
592 * in the lower bits, it contains flags
593 */
2bf3aa85
JQ
594static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
595 ram_addr_t offset)
56e93d26 596{
9f5f380b 597 size_t size, len;
56e93d26 598
24795694
JQ
599 if (block == rs->last_sent_block) {
600 offset |= RAM_SAVE_FLAG_CONTINUE;
601 }
2bf3aa85 602 qemu_put_be64(f, offset);
56e93d26
JQ
603 size = 8;
604
605 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 606 len = strlen(block->idstr);
2bf3aa85
JQ
607 qemu_put_byte(f, len);
608 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 609 size += 1 + len;
24795694 610 rs->last_sent_block = block;
56e93d26
JQ
611 }
612 return size;
613}
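
/*
 * Worked example: the first page sent from block "pc.ram" at offset
 * 0x1000 with RAM_SAVE_FLAG_PAGE produces the 8-byte value 0x1008,
 * followed by the length byte 6 and the 6 idstr bytes, i.e. 15 bytes in
 * total. Later pages from the same block get RAM_SAVE_FLAG_CONTINUE
 * or'ed into the offset and only the 8-byte field is sent.
 */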
614
3d0684b2
JQ
615/**
616 * mig_throttle_guest_down: throotle down the guest
617 *
618 * Reduce amount of guest cpu execution to hopefully slow down memory
619 * writes. If guest dirty memory rate is reduced below the rate at
620 * which we can transfer pages to the destination then we should be
621 * able to complete migration. Some workloads dirty memory way too
622 * fast and will not effectively converge, even with auto-converge.
070afca2 623 */
cbbf8182
KZ
624static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
625 uint64_t bytes_dirty_threshold)
070afca2
JH
626{
627 MigrationState *s = migrate_get_current();
2594f56d 628 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
629 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
630 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 631 int pct_max = s->parameters.max_cpu_throttle;
070afca2 632
cbbf8182
KZ
633 uint64_t throttle_now = cpu_throttle_get_percentage();
634 uint64_t cpu_now, cpu_ideal, throttle_inc;
635
070afca2
JH
636 /* We have not started throttling yet. Let's start it. */
637 if (!cpu_throttle_active()) {
638 cpu_throttle_set(pct_initial);
639 } else {
640 /* Throttling already on, just increase the rate */
cbbf8182
KZ
641 if (!pct_tailslow) {
642 throttle_inc = pct_increment;
643 } else {
644 /* Compute the ideal CPU percentage used by Guest, which may
645 * make the dirty rate match the dirty rate threshold. */
646 cpu_now = 100 - throttle_now;
647 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
648 bytes_dirty_period);
649 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
650 }
651 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
652 }
653}
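
/*
 * Worked example for the tailslow path: with the throttle currently at
 * 20%, a period that dirtied 200 MB against a 100 MB threshold gives
 * cpu_now = 80 and cpu_ideal = 80 * (100 / 200) = 40, so the increment
 * is MIN(80 - 40, cpu_throttle_increment) and the new throttle is
 * capped at max_cpu_throttle.
 */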
654
3d0684b2
JQ
655/**
656 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
657 *
6f37bb8b 658 * @rs: current RAM state
3d0684b2
JQ
659 * @current_addr: address for the zero page
660 *
661 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
662 * The important thing is that a stale (not-yet-0'd) page be replaced
663 * by the new data.
664 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 665 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 666 */
6f37bb8b 667static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 668{
6f37bb8b 669 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
670 return;
671 }
672
673 /* We don't care if this fails to allocate a new cache page
674 * as long as it updated an old one */
c00e0928 675 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 676 ram_counters.dirty_sync_count);
56e93d26
JQ
677}
678
679#define ENCODING_FLAG_XBZRLE 0x1
680
681/**
682 * save_xbzrle_page: compress and send current page
683 *
684 * Returns: 1 means that we wrote the page
685 * 0 means that page is identical to the one already sent
686 * -1 means that xbzrle would be longer than normal
687 *
5a987738 688 * @rs: current RAM state
3d0684b2
JQ
689 * @current_data: pointer to the address of the page contents
690 * @current_addr: addr of the page
56e93d26
JQ
691 * @block: block that contains the page we want to send
692 * @offset: offset inside the block for the page
693 * @last_stage: if we are at the completion stage
56e93d26 694 */
204b88b8 695static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 696 ram_addr_t current_addr, RAMBlock *block,
072c2511 697 ram_addr_t offset, bool last_stage)
56e93d26
JQ
698{
699 int encoded_len = 0, bytes_xbzrle;
700 uint8_t *prev_cached_page;
701
9360447d
JQ
702 if (!cache_is_cached(XBZRLE.cache, current_addr,
703 ram_counters.dirty_sync_count)) {
704 xbzrle_counters.cache_miss++;
56e93d26
JQ
705 if (!last_stage) {
706 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 707 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
708 return -1;
709 } else {
710 /* update *current_data when the page has been
711 inserted into cache */
712 *current_data = get_cached_data(XBZRLE.cache, current_addr);
713 }
714 }
715 return -1;
716 }
717
e460a4b1
WW
718 /*
719 * Reaching here means the page has hit the xbzrle cache, no matter what
720 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 721 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
722 *
723 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
724 * 2nd page turns out to be skipped (i.e. no new bytes written to the
725 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
726 * skipped page included. In this way, the encoding rate can tell if the
727 * guest page is good for xbzrle encoding.
728 */
729 xbzrle_counters.pages++;
56e93d26
JQ
730 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
731
732 /* save current buffer into memory */
733 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
734
735 /* XBZRLE encoding (if there is no overflow) */
736 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
737 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
738 TARGET_PAGE_SIZE);
ca353803
WY
739
740 /*
741 * Update the cache contents, so that it corresponds to the data
742 * sent, in all cases except where we skip the page.
743 */
744 if (!last_stage && encoded_len != 0) {
745 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
746 /*
747 * In the case where we couldn't compress, ensure that the caller
748 * sends the data from the cache, since the guest might have
749 * changed the RAM since we copied it.
750 */
751 *current_data = prev_cached_page;
752 }
753
56e93d26 754 if (encoded_len == 0) {
55c4446b 755 trace_save_xbzrle_page_skipping();
56e93d26
JQ
756 return 0;
757 } else if (encoded_len == -1) {
55c4446b 758 trace_save_xbzrle_page_overflow();
9360447d 759 xbzrle_counters.overflow++;
e460a4b1 760 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
761 return -1;
762 }
763
56e93d26 764 /* Send XBZRLE based compressed page */
2bf3aa85 765 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
766 offset | RAM_SAVE_FLAG_XBZRLE);
767 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
768 qemu_put_be16(rs->f, encoded_len);
769 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 770 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
771 /*
772 * Like compressed_size (please see update_compress_thread_counts),
773 * the xbzrle encoded bytes don't count the 8 byte header with
774 * RAM_SAVE_FLAG_CONTINUE.
775 */
776 xbzrle_counters.bytes += bytes_xbzrle - 8;
9360447d 777 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
778
779 return 1;
780}
781
3d0684b2
JQ
782/**
783 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 784 *
a5f7b1a6 785 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 786 *
6f37bb8b 787 * @rs: current RAM state
3d0684b2 788 * @rb: RAMBlock where to search for dirty pages
a935e30f 789 * @start: page where we start the search
f3f491fc 790 */
56e93d26 791static inline
a935e30f 792unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 793 unsigned long start)
56e93d26 794{
6b6712ef
JQ
795 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
797 unsigned long next;
798
fbd162e6 799 if (ramblock_is_ignored(rb)) {
b895de50
CLG
800 return size;
801 }
802
6eeb63f7
WW
803 /*
804 * When the free page optimization is enabled, we need to check the bitmap
805 * to send the non-free pages rather than all the pages in the bulk stage.
806 */
807 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 808 next = start + 1;
56e93d26 809 } else {
6b6712ef 810 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
811 }
812
6b6712ef 813 return next;
56e93d26
JQ
814}
815
06b10688 816static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
817 RAMBlock *rb,
818 unsigned long page)
a82d593b
DDAG
819{
820 bool ret;
a82d593b 821
386a907b 822 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
823
824 /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk, because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk, we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
831 */
832 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
833 uint8_t shift = rb->clear_bmap_shift;
834 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
8bba004c 835 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
002cad6b
PX
836
837 /*
838 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since the start address
         * of the small chunk will always be aligned to 64 pages, so the
841 * bitmap will always be aligned to unsigned long. We should
842 * even be able to remove this restriction but I'm simply
843 * keeping it.
844 */
845 assert(shift >= 6);
846 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
847 memory_region_clear_dirty_bitmap(rb->mr, start, size);
848 }
849
6b6712ef 850 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
851
852 if (ret) {
0d8ec885 853 rs->migration_dirty_pages--;
a82d593b 854 }
386a907b
WW
855 qemu_mutex_unlock(&rs->bitmap_mutex);
856
a82d593b
DDAG
857 return ret;
858}
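
/*
 * Worked example: with 4 KiB target pages and a clear_bmap_shift of 18,
 * one clear_bmap bit covers 1ULL << (12 + 18) = 1 GiB of guest memory,
 * so the first page sent from such a chunk triggers a single
 * memory_region_clear_dirty_bitmap() call for the whole 1 GiB range.
 */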
859
267691b6 860/* Called with RCU critical section */
7a3e9571 861static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 862{
fb613580
KZ
863 uint64_t new_dirty_pages =
864 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
865
866 rs->migration_dirty_pages += new_dirty_pages;
867 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
868}
869
3d0684b2
JQ
870/**
871 * ram_pagesize_summary: calculate all the pagesizes of a VM
872 *
873 * Returns a summary bitmap of the page sizes of all RAMBlocks
874 *
875 * For VMs with just normal pages this is equivalent to the host page
876 * size. If it's got some huge pages then it's the OR of all the
877 * different page sizes.
e8ca1db2
DDAG
878 */
879uint64_t ram_pagesize_summary(void)
880{
881 RAMBlock *block;
882 uint64_t summary = 0;
883
fbd162e6 884 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
885 summary |= block->page_size;
886 }
887
888 return summary;
889}
890
aecbfe9c
XG
891uint64_t ram_get_total_transferred_pages(void)
892{
893 return ram_counters.normal + ram_counters.duplicate +
894 compression_counters.pages + xbzrle_counters.pages;
895}
896
b734035b
XG
897static void migration_update_rates(RAMState *rs, int64_t end_time)
898{
be8b02ed 899 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 900 double compressed_size;
b734035b
XG
901
902 /* calculate period counters */
903 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
904 / (end_time - rs->time_last_bitmap_sync);
905
be8b02ed 906 if (!page_count) {
b734035b
XG
907 return;
908 }
909
910 if (migrate_use_xbzrle()) {
e460a4b1
WW
911 double encoded_size, unencoded_size;
912
b734035b 913 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 914 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 915 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
916 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
917 TARGET_PAGE_SIZE;
918 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 919 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 920 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
921 } else {
922 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
923 }
924 rs->xbzrle_pages_prev = xbzrle_counters.pages;
925 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 926 }
76e03000
XG
927
928 if (migrate_use_compression()) {
929 compression_counters.busy_rate = (double)(compression_counters.busy -
930 rs->compress_thread_busy_prev) / page_count;
931 rs->compress_thread_busy_prev = compression_counters.busy;
932
933 compressed_size = compression_counters.compressed_size -
934 rs->compressed_size_prev;
935 if (compressed_size) {
936 double uncompressed_size = (compression_counters.pages -
937 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
938
939 /* Compression-Ratio = Uncompressed-size / Compressed-size */
940 compression_counters.compression_rate =
941 uncompressed_size / compressed_size;
942
943 rs->compress_pages_prev = compression_counters.pages;
944 rs->compressed_size_prev = compression_counters.compressed_size;
945 }
946 }
b734035b
XG
947}
948
dc14a470
KZ
949static void migration_trigger_throttle(RAMState *rs)
950{
951 MigrationState *s = migrate_get_current();
952 uint64_t threshold = s->parameters.throttle_trigger_threshold;
953
954 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
955 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
956 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
957
958 /* During block migration the auto-converge logic incorrectly detects
959 * that ram migration makes no progress. Avoid this by disabling the
960 * throttling logic during the bulk phase of block migration. */
961 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
962 /* The following detection logic can be refined later. For now:
963 Check to see if the ratio between dirtied bytes and the approx.
964 amount of bytes that just got transferred since the last time
965 we were in this routine reaches the threshold. If that happens
966 twice, start or increase throttling. */
967
968 if ((bytes_dirty_period > bytes_dirty_threshold) &&
969 (++rs->dirty_rate_high_cnt >= 2)) {
970 trace_migration_throttle();
971 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
972 mig_throttle_guest_down(bytes_dirty_period,
973 bytes_dirty_threshold);
dc14a470
KZ
974 }
975 }
976}
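
/*
 * Worked example: with a throttle_trigger_threshold of 50 and 1 GiB
 * transferred during the period, bytes_dirty_threshold is 512 MiB;
 * if the guest dirties more than that twice, throttling is started
 * or increased.
 */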
977
8d820d6f 978static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
979{
980 RAMBlock *block;
56e93d26 981 int64_t end_time;
56e93d26 982
9360447d 983 ram_counters.dirty_sync_count++;
56e93d26 984
f664da80
JQ
985 if (!rs->time_last_bitmap_sync) {
986 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
987 }
988
989 trace_migration_bitmap_sync_start();
9c1f8f44 990 memory_global_dirty_log_sync();
56e93d26 991
108cfae0 992 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
993 WITH_RCU_READ_LOCK_GUARD() {
994 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
995 ramblock_sync_dirty_bitmap(rs, block);
996 }
997 ram_counters.remaining = ram_bytes_remaining();
56e93d26 998 }
108cfae0 999 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1000
9458a9a1 1001 memory_global_after_dirty_log_sync();
a66cd90c 1002 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1003
56e93d26
JQ
1004 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1005
    /* more than 1 second = 1000 milliseconds */
f664da80 1007 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1008 migration_trigger_throttle(rs);
070afca2 1009
b734035b
XG
1010 migration_update_rates(rs, end_time);
1011
be8b02ed 1012 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1013
1014 /* reset period counters */
f664da80 1015 rs->time_last_bitmap_sync = end_time;
a66cd90c 1016 rs->num_dirty_pages_period = 0;
dc14a470 1017 rs->bytes_xfer_prev = ram_counters.transferred;
56e93d26 1018 }
4addcd4f 1019 if (migrate_use_events()) {
3ab72385 1020 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1021 }
56e93d26
JQ
1022}
1023
bd227060
WW
1024static void migration_bitmap_sync_precopy(RAMState *rs)
1025{
1026 Error *local_err = NULL;
1027
1028 /*
1029 * The current notifier usage is just an optimization to migration, so we
1030 * don't stop the normal migration process in the error case.
1031 */
1032 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1033 error_report_err(local_err);
b4a1733c 1034 local_err = NULL;
bd227060
WW
1035 }
1036
1037 migration_bitmap_sync(rs);
1038
1039 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1040 error_report_err(local_err);
1041 }
1042}
1043
6c97ec5f
XG
1044/**
1045 * save_zero_page_to_file: send the zero page to the file
1046 *
1047 * Returns the size of data written to the file, 0 means the page is not
1048 * a zero page
1049 *
1050 * @rs: current RAM state
1051 * @file: the file where the data is saved
1052 * @block: block that contains the page we want to send
1053 * @offset: offset inside the block for the page
1054 */
1055static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1056 RAMBlock *block, ram_addr_t offset)
1057{
1058 uint8_t *p = block->host + offset;
1059 int len = 0;
1060
1061 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1062 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1063 qemu_put_byte(file, 0);
1064 len += 1;
1065 }
1066 return len;
1067}
1068
56e93d26 1069/**
3d0684b2 1070 * save_zero_page: send the zero page to the stream
56e93d26 1071 *
3d0684b2 1072 * Returns the number of pages written.
56e93d26 1073 *
f7ccd61b 1074 * @rs: current RAM state
56e93d26
JQ
1075 * @block: block that contains the page we want to send
1076 * @offset: offset inside the block for the page
56e93d26 1077 */
7faccdc3 1078static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1079{
6c97ec5f 1080 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1081
6c97ec5f 1082 if (len) {
9360447d 1083 ram_counters.duplicate++;
6c97ec5f
XG
1084 ram_counters.transferred += len;
1085 return 1;
56e93d26 1086 }
6c97ec5f 1087 return -1;
56e93d26
JQ
1088}
1089
5727309d 1090static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1091{
5727309d 1092 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1093 return;
1094 }
1095
8bba004c 1096 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
53f09a10
PB
1097}
1098
059ff0fb
XG
1099/*
1100 * @pages: the number of pages written by the control path,
1101 * < 0 - error
1102 * > 0 - number of pages written
1103 *
 * Return true if the page has been saved, otherwise false is returned.
1105 */
1106static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1107 int *pages)
1108{
1109 uint64_t bytes_xmit = 0;
1110 int ret;
1111
1112 *pages = -1;
1113 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1114 &bytes_xmit);
1115 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1116 return false;
1117 }
1118
1119 if (bytes_xmit) {
1120 ram_counters.transferred += bytes_xmit;
1121 *pages = 1;
1122 }
1123
1124 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1125 return true;
1126 }
1127
1128 if (bytes_xmit > 0) {
1129 ram_counters.normal++;
1130 } else if (bytes_xmit == 0) {
1131 ram_counters.duplicate++;
1132 }
1133
1134 return true;
1135}
1136
65dacaa0
XG
1137/*
1138 * directly send the page to the stream
1139 *
1140 * Returns the number of pages written.
1141 *
1142 * @rs: current RAM state
1143 * @block: block that contains the page we want to send
1144 * @offset: offset inside the block for the page
1145 * @buf: the page to be sent
 * @async: send the page asynchronously
1147 */
1148static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1149 uint8_t *buf, bool async)
1150{
1151 ram_counters.transferred += save_page_header(rs, rs->f, block,
1152 offset | RAM_SAVE_FLAG_PAGE);
1153 if (async) {
1154 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1155 migrate_release_ram() &
1156 migration_in_postcopy());
1157 } else {
1158 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1159 }
1160 ram_counters.transferred += TARGET_PAGE_SIZE;
1161 ram_counters.normal++;
1162 return 1;
1163}
1164
56e93d26 1165/**
3d0684b2 1166 * ram_save_page: send the given page to the stream
56e93d26 1167 *
3d0684b2 1168 * Returns the number of pages written.
3fd3c4b3
DDAG
1169 * < 0 - error
1170 * >=0 - Number of pages written - this might legally be 0
1171 * if xbzrle noticed the page was the same.
56e93d26 1172 *
6f37bb8b 1173 * @rs: current RAM state
56e93d26
JQ
1174 * @block: block that contains the page we want to send
1175 * @offset: offset inside the block for the page
1176 * @last_stage: if we are at the completion stage
56e93d26 1177 */
a0a8aa14 1178static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1179{
1180 int pages = -1;
56e93d26 1181 uint8_t *p;
56e93d26 1182 bool send_async = true;
a08f6890 1183 RAMBlock *block = pss->block;
8bba004c 1184 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1185 ram_addr_t current_addr = block->offset + offset;
56e93d26 1186
2f68e399 1187 p = block->host + offset;
1db9d8e5 1188 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1189
56e93d26 1190 XBZRLE_cache_lock();
d7400a34
XG
1191 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1192 migrate_use_xbzrle()) {
059ff0fb
XG
1193 pages = save_xbzrle_page(rs, &p, current_addr, block,
1194 offset, last_stage);
1195 if (!last_stage) {
1196 /* Can't send this cached data async, since the cache page
1197 * might get updated before it gets to the wire
56e93d26 1198 */
059ff0fb 1199 send_async = false;
56e93d26
JQ
1200 }
1201 }
1202
1203 /* XBZRLE overflow or normal page */
1204 if (pages == -1) {
65dacaa0 1205 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1206 }
1207
1208 XBZRLE_cache_unlock();
1209
1210 return pages;
1211}
1212
b9ee2f7d
JQ
1213static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214 ram_addr_t offset)
1215{
67a4c891 1216 if (multifd_queue_page(rs->f, block, offset) < 0) {
713f762a
IR
1217 return -1;
1218 }
b9ee2f7d
JQ
1219 ram_counters.normal++;
1220
1221 return 1;
1222}
1223
5e5fdcff 1224static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1225 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1226{
53518d94 1227 RAMState *rs = ram_state;
a7a9a88f 1228 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 1229 bool zero_page = false;
6ef3771c 1230 int ret;
56e93d26 1231
5e5fdcff
XG
1232 if (save_zero_page_to_file(rs, f, block, offset)) {
1233 zero_page = true;
1234 goto exit;
1235 }
1236
6ef3771c 1237 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1238
1239 /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
1242 * decompression
1243 */
1244 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1245 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1246 if (ret < 0) {
1247 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1248 error_report("compressed data failed!");
5e5fdcff 1249 return false;
b3be2896 1250 }
56e93d26 1251
5e5fdcff 1252exit:
6ef3771c 1253 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
1254 return zero_page;
1255}
1256
1257static void
1258update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1259{
76e03000
XG
1260 ram_counters.transferred += bytes_xmit;
1261
5e5fdcff
XG
1262 if (param->zero_page) {
1263 ram_counters.duplicate++;
76e03000 1264 return;
5e5fdcff 1265 }
76e03000
XG
1266
1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268 compression_counters.compressed_size += bytes_xmit - 8;
1269 compression_counters.pages++;
56e93d26
JQ
1270}
1271
32b05495
XG
1272static bool save_page_use_compression(RAMState *rs);
1273
ce25d337 1274static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1275{
1276 int idx, len, thread_count;
1277
32b05495 1278 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1279 return;
1280 }
1281 thread_count = migrate_compress_threads();
a7a9a88f 1282
0d9f9a5c 1283 qemu_mutex_lock(&comp_done_lock);
56e93d26 1284 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1285 while (!comp_param[idx].done) {
0d9f9a5c 1286 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1287 }
a7a9a88f 1288 }
0d9f9a5c 1289 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1290
1291 for (idx = 0; idx < thread_count; idx++) {
1292 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1293 if (!comp_param[idx].quit) {
ce25d337 1294 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1295 /*
1296 * it's safe to fetch zero_page without holding comp_done_lock
1297 * as there is no further request submitted to the thread,
1298 * i.e, the thread should be waiting for a request at this point.
1299 */
1300 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1301 }
a7a9a88f 1302 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1303 }
1304}
1305
1306static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307 ram_addr_t offset)
1308{
1309 param->block = block;
1310 param->offset = offset;
1311}
1312
ce25d337
JQ
1313static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1314 ram_addr_t offset)
56e93d26
JQ
1315{
1316 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1317 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1318
1319 thread_count = migrate_compress_threads();
0d9f9a5c 1320 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1321retry:
1322 for (idx = 0; idx < thread_count; idx++) {
1323 if (comp_param[idx].done) {
1324 comp_param[idx].done = false;
1325 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326 qemu_mutex_lock(&comp_param[idx].mutex);
1327 set_compress_params(&comp_param[idx], block, offset);
1328 qemu_cond_signal(&comp_param[idx].cond);
1329 qemu_mutex_unlock(&comp_param[idx].mutex);
1330 pages = 1;
5e5fdcff 1331 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1332 break;
56e93d26
JQ
1333 }
1334 }
1d58872a
XG
1335
1336 /*
1337 * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
1339 */
1340 if (pages < 0 && wait) {
1341 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1342 goto retry;
1343 }
0d9f9a5c 1344 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1345
1346 return pages;
1347}
1348
3d0684b2
JQ
1349/**
1350 * find_dirty_block: find the next dirty page and update any state
1351 * associated with the search process.
b9e60928 1352 *
a5f7b1a6 1353 * Returns true if a page is found
b9e60928 1354 *
6f37bb8b 1355 * @rs: current RAM state
3d0684b2
JQ
1356 * @pss: data about the state of the current dirty page scan
1357 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1358 */
f20e2865 1359static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1360{
f20e2865 1361 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1362 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1363 pss->page >= rs->last_page) {
b9e60928
DDAG
1364 /*
1365 * We've been once around the RAM and haven't found anything.
1366 * Give up.
1367 */
1368 *again = false;
1369 return false;
1370 }
8bba004c
AR
1371 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1372 >= pss->block->used_length) {
b9e60928 1373 /* Didn't find anything in this RAM Block */
a935e30f 1374 pss->page = 0;
b9e60928
DDAG
1375 pss->block = QLIST_NEXT_RCU(pss->block, next);
1376 if (!pss->block) {
48df9d80
XG
1377 /*
1378 * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
1384 * point. In theory, xbzrle can do better than compression.
1385 */
1386 flush_compressed_data(rs);
1387
b9e60928
DDAG
1388 /* Hit the end of the list */
1389 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390 /* Flag that we've looped */
1391 pss->complete_round = true;
6f37bb8b 1392 rs->ram_bulk_stage = false;
b9e60928
DDAG
1393 }
1394 /* Didn't find anything this time, but try again on the new block */
1395 *again = true;
1396 return false;
1397 } else {
1398 /* Can go around again, but... */
1399 *again = true;
1400 /* We've found something so probably don't need to */
1401 return true;
1402 }
1403}
1404
3d0684b2
JQ
1405/**
1406 * unqueue_page: gets a page of the queue
1407 *
a82d593b 1408 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1409 *
3d0684b2
JQ
1410 * Returns the block of the page (or NULL if none available)
1411 *
ec481c6c 1412 * @rs: current RAM state
3d0684b2 1413 * @offset: used to return the offset within the RAMBlock
a82d593b 1414 */
f20e2865 1415static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1416{
1417 RAMBlock *block = NULL;
1418
ae526e32
XG
1419 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1420 return NULL;
1421 }
1422
6e8a355d 1423 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
ec481c6c
JQ
1424 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1425 struct RAMSrcPageRequest *entry =
1426 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1427 block = entry->rb;
1428 *offset = entry->offset;
a82d593b
DDAG
1429
1430 if (entry->len > TARGET_PAGE_SIZE) {
1431 entry->len -= TARGET_PAGE_SIZE;
1432 entry->offset += TARGET_PAGE_SIZE;
1433 } else {
1434 memory_region_unref(block->mr);
ec481c6c 1435 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 1436 g_free(entry);
e03a34f8 1437 migration_consume_urgent_request();
a82d593b
DDAG
1438 }
1439 }
a82d593b
DDAG
1440
1441 return block;
1442}
1443
278e2f55
AG
1444#if defined(__linux__)
1445/**
 * poll_fault_page: try to get the next UFFD write fault page and, if a pending fault
1447 * is found, return RAM block pointer and page offset
1448 *
1449 * Returns pointer to the RAMBlock containing faulting page,
1450 * NULL if no write faults are pending
1451 *
1452 * @rs: current RAM state
1453 * @offset: page offset from the beginning of the block
1454 */
1455static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1456{
1457 struct uffd_msg uffd_msg;
1458 void *page_address;
1459 RAMBlock *bs;
1460 int res;
1461
1462 if (!migrate_background_snapshot()) {
1463 return NULL;
1464 }
1465
1466 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1467 if (res <= 0) {
1468 return NULL;
1469 }
1470
1471 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1472 bs = qemu_ram_block_from_host(page_address, false, offset);
1473 assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
1474 return bs;
1475}
1476
1477/**
1478 * ram_save_release_protection: release UFFD write protection after
1479 * a range of pages has been saved
1480 *
1481 * @rs: current RAM state
1482 * @pss: page-search-status structure
1483 * @start_page: index of the first page in the range relative to pss->block
1484 *
1485 * Returns 0 on success, negative value in case of an error
1486*/
1487static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1488 unsigned long start_page)
1489{
1490 int res = 0;
1491
1492 /* Check if page is from UFFD-managed region. */
1493 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1494 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1495 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1496
1497 /* Flush async buffers before un-protect. */
1498 qemu_fflush(rs->f);
1499 /* Un-protect memory range. */
1500 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1501 false, false);
1502 }
1503
1504 return res;
1505}
1506
1507/* ram_write_tracking_available: check if kernel supports required UFFD features
1508 *
1509 * Returns true if supports, false otherwise
1510 */
1511bool ram_write_tracking_available(void)
1512{
1513 uint64_t uffd_features;
1514 int res;
1515
1516 res = uffd_query_features(&uffd_features);
1517 return (res == 0 &&
1518 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1519}
1520
1521/* ram_write_tracking_compatible: check if guest configuration is
1522 * compatible with 'write-tracking'
1523 *
1524 * Returns true if compatible, false otherwise
1525 */
1526bool ram_write_tracking_compatible(void)
1527{
1528 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1529 int uffd_fd;
1530 RAMBlock *bs;
1531 bool ret = false;
1532
1533 /* Open UFFD file descriptor */
1534 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1535 if (uffd_fd < 0) {
1536 return false;
1537 }
1538
1539 RCU_READ_LOCK_GUARD();
1540
1541 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1542 uint64_t uffd_ioctls;
1543
1544 /* Nothing to do with read-only and MMIO-writable regions */
1545 if (bs->mr->readonly || bs->mr->rom_device) {
1546 continue;
1547 }
1548 /* Try to register block memory via UFFD-IO to track writes */
1549 if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
1550 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1551 goto out;
1552 }
1553 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1554 goto out;
1555 }
1556 }
1557 ret = true;
1558
1559out:
1560 uffd_close_fd(uffd_fd);
1561 return ret;
1562}
1563
1564/*
1565 * ram_write_tracking_start: start UFFD-WP memory tracking
1566 *
1567 * Returns 0 for success or negative value in case of error
1568 */
1569int ram_write_tracking_start(void)
1570{
1571 int uffd_fd;
1572 RAMState *rs = ram_state;
1573 RAMBlock *bs;
1574
1575 /* Open UFFD file descriptor */
1576 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1577 if (uffd_fd < 0) {
1578 return uffd_fd;
1579 }
1580 rs->uffdio_fd = uffd_fd;
1581
1582 RCU_READ_LOCK_GUARD();
1583
1584 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1585 /* Nothing to do with read-only and MMIO-writable regions */
1586 if (bs->mr->readonly || bs->mr->rom_device) {
1587 continue;
1588 }
1589
1590 /* Register block memory with UFFD to track writes */
1591 if (uffd_register_memory(rs->uffdio_fd, bs->host,
1592 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1593 goto fail;
1594 }
1595 /* Apply UFFD write protection to the block memory range */
1596 if (uffd_change_protection(rs->uffdio_fd, bs->host,
1597 bs->max_length, true, false)) {
1598 goto fail;
1599 }
1600 bs->flags |= RAM_UF_WRITEPROTECT;
1601 memory_region_ref(bs->mr);
1602
1603 trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
1604 bs->host, bs->max_length);
1605 }
1606
1607 return 0;
1608
1609fail:
1610 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1611
1612 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1613 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1614 continue;
1615 }
1616 /*
1617 * In case some memory block failed to be write-protected
1618 * remove protection and unregister all succeeded RAM blocks
1619 */
1620 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1621 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1622 /* Cleanup flags and remove reference */
1623 bs->flags &= ~RAM_UF_WRITEPROTECT;
1624 memory_region_unref(bs->mr);
1625 }
1626
1627 uffd_close_fd(uffd_fd);
1628 rs->uffdio_fd = -1;
1629 return -1;
1630}
1631
1632/**
1633 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1634 */
1635void ram_write_tracking_stop(void)
1636{
1637 RAMState *rs = ram_state;
1638 RAMBlock *bs;
1639
1640 RCU_READ_LOCK_GUARD();
1641
1642 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1643 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1644 continue;
1645 }
1646 /* Remove protection and unregister all affected RAM blocks */
1647 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1648 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1649
1650 trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
1651 bs->host, bs->max_length);
1652
1653 /* Cleanup flags and remove reference */
1654 bs->flags &= ~RAM_UF_WRITEPROTECT;
1655 memory_region_unref(bs->mr);
1656 }
1657
1658 /* Finally close UFFD file descriptor */
1659 uffd_close_fd(rs->uffdio_fd);
1660 rs->uffdio_fd = -1;
1661}
1662
1663#else
1664/* No target OS support, stubs just fail or ignore */
1665
1666static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1667{
1668 (void) rs;
1669 (void) offset;
1670
1671 return NULL;
1672}
1673
1674static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1675 unsigned long start_page)
1676{
1677 (void) rs;
1678 (void) pss;
1679 (void) start_page;
1680
1681 return 0;
1682}
1683
1684bool ram_write_tracking_available(void)
1685{
1686 return false;
1687}
1688
1689bool ram_write_tracking_compatible(void)
1690{
1691 assert(0);
1692 return false;
1693}
1694
1695int ram_write_tracking_start(void)
1696{
1697 assert(0);
1698 return -1;
1699}
1700
1701void ram_write_tracking_stop(void)
1702{
1703 assert(0);
1704}
1705#endif /* defined(__linux__) */
1706
3d0684b2 1707/**
ff1543af 1708 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1709 *
1710 * Skips pages that are already sent (!dirty)
a82d593b 1711 *
a5f7b1a6 1712 * Returns true if a queued page is found
a82d593b 1713 *
6f37bb8b 1714 * @rs: current RAM state
3d0684b2 1715 * @pss: data about the state of the current dirty page scan
a82d593b 1716 */
f20e2865 1717static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1718{
1719 RAMBlock *block;
1720 ram_addr_t offset;
1721 bool dirty;
1722
1723 do {
f20e2865 1724 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1725 /*
1726 * We're sending this page, and since it's postcopy nothing else
1727 * will dirty it, and we must make sure it doesn't get sent again
1728 * even if this queue request was received after the background
1729 * search already sent it.
1730 */
1731 if (block) {
f20e2865
JQ
1732 unsigned long page;
1733
6b6712ef
JQ
1734 page = offset >> TARGET_PAGE_BITS;
1735 dirty = test_bit(page, block->bmap);
a82d593b 1736 if (!dirty) {
06b10688 1737 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 1738 page);
a82d593b 1739 } else {
f20e2865 1740 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1741 }
1742 }
1743
1744 } while (block && !dirty);
1745
278e2f55
AG
1746 if (!block) {
1747 /*
1748 * Poll write faults too if background snapshot is enabled; that's
1749 * when vCPUs get blocked by write-protected pages.
1750 */
1751 block = poll_fault_page(rs, &offset);
1752 }
1753
a82d593b
DDAG
1754 if (block) {
1755 /*
1756 * As soon as we start servicing pages out of order, then we have
1757 * to kill the bulk stage, since the bulk stage assumes
1758 * (in migration_bitmap_find_and_reset_dirty) that every page is
1759 * dirty, which is no longer true.
1760 */
6f37bb8b 1761 rs->ram_bulk_stage = false;
a82d593b
DDAG
1762
1763 /*
1764 * We want the background search to continue from the queued page
1765 * since the guest is likely to want other pages near to the page
1766 * it just requested.
1767 */
1768 pss->block = block;
a935e30f 1769 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1770
1771 /*
1772 * This unqueued page would break the "one round" check, even if
1773 * that is really rare.
1774 */
1775 pss->complete_round = false;
a82d593b
DDAG
1776 }
1777
1778 return !!block;
1779}
1780
6c595cde 1781/**
5e58f968
JQ
1782 * migration_page_queue_free: drop any remaining pages in the ram
1783 * request queue
6c595cde 1784 *
3d0684b2
JQ
1785 * It should be empty at the end anyway, but in error cases there may
1786 * be some left. If any pages are left, we drop them.
1787 *
6c595cde 1788 */
83c13382 1789static void migration_page_queue_free(RAMState *rs)
6c595cde 1790{
ec481c6c 1791 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1792 /* This queue should generally be empty - but a failed migration
1793 * might have left some entries in it.
1794 */
89ac5a1d 1795 RCU_READ_LOCK_GUARD();
ec481c6c 1796 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1797 memory_region_unref(mspr->rb->mr);
ec481c6c 1798 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1799 g_free(mspr);
1800 }
6c595cde
DDAG
1801}
1802
1803/**
3d0684b2
JQ
1804 * ram_save_queue_pages: queue the page for transmission
1805 *
1806 * A request from the postcopy destination, for example.
1807 *
1808 * Returns zero on success or negative on error
1809 *
3d0684b2
JQ
1810 * @rbname: Name of the RAMBlock of the request. NULL means the
1811 * same as the last one.
1812 * @start: starting address from the start of the RAMBlock
1813 * @len: length (in bytes) to send
6c595cde 1814 */
96506894 1815int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1816{
1817 RAMBlock *ramblock;
53518d94 1818 RAMState *rs = ram_state;
6c595cde 1819
9360447d 1820 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
1821 RCU_READ_LOCK_GUARD();
1822
6c595cde
DDAG
1823 if (!rbname) {
1824 /* Reuse last RAMBlock */
68a098f3 1825 ramblock = rs->last_req_rb;
6c595cde
DDAG
1826
1827 if (!ramblock) {
1828 /*
1829 * Shouldn't happen, we can't reuse the last RAMBlock if
1830 * it's the 1st request.
1831 */
1832 error_report("ram_save_queue_pages no previous block");
03acb4e9 1833 return -1;
6c595cde
DDAG
1834 }
1835 } else {
1836 ramblock = qemu_ram_block_by_name(rbname);
1837
1838 if (!ramblock) {
1839 /* We shouldn't be asked for a non-existent RAMBlock */
1840 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 1841 return -1;
6c595cde 1842 }
68a098f3 1843 rs->last_req_rb = ramblock;
6c595cde
DDAG
1844 }
1845 trace_ram_save_queue_pages(ramblock->idstr, start, len);
395cb450 1846 if (start + len > ramblock->used_length) {
9458ad6b
JQ
1847 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1848 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 1849 __func__, start, len, ramblock->used_length);
03acb4e9 1850 return -1;
6c595cde
DDAG
1851 }
1852
ec481c6c
JQ
1853 struct RAMSrcPageRequest *new_entry =
1854 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1855 new_entry->rb = ramblock;
1856 new_entry->offset = start;
1857 new_entry->len = len;
1858
1859 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1860 qemu_mutex_lock(&rs->src_page_req_mutex);
1861 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 1862 migration_make_urgent_request();
ec481c6c 1863 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1864
1865 return 0;
6c595cde
DDAG
1866}
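
As a usage illustration of the queueing API above (the RAMBlock name and offset here are hypothetical), a page-request handler on the source side could enqueue one target page like this:

/* Hypothetical example: ask for one target page of block "pc.ram". */
if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
    /* -1 is returned if the block is unknown or the range overruns it */
}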
1867
d7400a34
XG
1868static bool save_page_use_compression(RAMState *rs)
1869{
1870 if (!migrate_use_compression()) {
1871 return false;
1872 }
1873
1874 /*
1875 * If xbzrle is on, stop using data compression after the first
1876 * round of migration even if compression is enabled. In theory,
1877 * xbzrle can do better than compression.
1878 */
1879 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1880 return true;
1881 }
1882
1883 return false;
1884}
1885
5e5fdcff
XG
1886/*
1887 * Try to compress the page before posting it out; return true if the page
1888 * has been properly handled by compression, otherwise it needs other
1889 * paths to handle it.
1890 */
1891static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1892{
1893 if (!save_page_use_compression(rs)) {
1894 return false;
1895 }
1896
1897 /*
1898 * When starting the process of a new block, the first page of
1899 * the block should be sent out before other pages in the same
1900 * block, and all the pages in the last block should have been sent
1901 * out; keeping this order is important, because the 'cont' flag
1902 * is used to avoid resending the block name.
1903 *
1904 * We post the first page as a normal page because compression takes
1905 * a lot of CPU time.
1906 */
1907 if (block != rs->last_sent_block) {
1908 flush_compressed_data(rs);
1909 return false;
1910 }
1911
1912 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1913 return true;
1914 }
1915
76e03000 1916 compression_counters.busy++;
5e5fdcff
XG
1917 return false;
1918}
1919
a82d593b 1920/**
3d0684b2 1921 * ram_save_target_page: save one target page
a82d593b 1922 *
3d0684b2 1923 * Returns the number of pages written
a82d593b 1924 *
6f37bb8b 1925 * @rs: current RAM state
3d0684b2 1926 * @pss: data about the page we want to send
a82d593b 1927 * @last_stage: if we are at the completion stage
a82d593b 1928 */
a0a8aa14 1929static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1930 bool last_stage)
a82d593b 1931{
a8ec91f9 1932 RAMBlock *block = pss->block;
8bba004c 1933 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
1934 int res;
1935
1936 if (control_save_page(rs, block, offset, &res)) {
1937 return res;
1938 }
1939
5e5fdcff
XG
1940 if (save_compress_page(rs, block, offset)) {
1941 return 1;
d7400a34
XG
1942 }
1943
1944 res = save_zero_page(rs, block, offset);
1945 if (res > 0) {
1946 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1947 * page would be stale
1948 */
1949 if (!save_page_use_compression(rs)) {
1950 XBZRLE_cache_lock();
1951 xbzrle_cache_zero_page(rs, block->offset + offset);
1952 XBZRLE_cache_unlock();
1953 }
1954 ram_release_pages(block->idstr, offset, res);
1955 return res;
1956 }
1957
da3f56cb 1958 /*
c6b3a2e0
WY
1959 * Do not use multifd for:
1960 * 1. Compression as the first page in the new block should be posted out
1961 * before sending the compressed page
1962 * 2. In postcopy as one whole host page should be placed
da3f56cb 1963 */
c6b3a2e0
WY
1964 if (!save_page_use_compression(rs) && migrate_use_multifd()
1965 && !migration_in_postcopy()) {
b9ee2f7d 1966 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
1967 }
1968
1faa5665 1969 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1970}
1971
1972/**
3d0684b2 1973 * ram_save_host_page: save a whole host page
a82d593b 1974 *
3d0684b2
JQ
1975 * Starting at *offset send pages up to the end of the current host
1976 * page. It's valid for the initial offset to point into the middle of
1977 * a host page, in which case the remainder of the host page is sent.
1978 * Only dirty target pages are sent. Note that the host page size may
1979 * be a huge page for this block.
1eb3fc0a
DDAG
1980 * The saving stops at the boundary of the used_length of the block
1981 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1982 *
3d0684b2
JQ
1983 * Returns the number of pages written or negative on error
1984 *
6f37bb8b 1985 * @rs: current RAM state
3d0684b2 1986 * @ms: current migration state
3d0684b2 1987 * @pss: data about the page we want to send
a82d593b 1988 * @last_stage: if we are at the completion stage
a82d593b 1989 */
a0a8aa14 1990static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1991 bool last_stage)
a82d593b
DDAG
1992{
1993 int tmppages, pages = 0;
a935e30f
JQ
1994 size_t pagesize_bits =
1995 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
278e2f55
AG
1996 unsigned long start_page = pss->page;
1997 int res;
4c011c37 1998
fbd162e6 1999 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2000 error_report("block %s should not be migrated !", pss->block->idstr);
2001 return 0;
2002 }
2003
a82d593b 2004 do {
1faa5665
XG
2005 /* Check whether the page is dirty and, if it is, send it */
2006 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2007 pss->page++;
2008 continue;
2009 }
2010
f20e2865 2011 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2012 if (tmppages < 0) {
2013 return tmppages;
2014 }
2015
2016 pages += tmppages;
a935e30f 2017 pss->page++;
97e1e067
DDAG
2018 /* Allow rate limiting to happen in the middle of huge pages */
2019 migration_rate_limit();
1eb3fc0a 2020 } while ((pss->page & (pagesize_bits - 1)) &&
8bba004c
AR
2021 offset_in_ramblock(pss->block,
2022 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
a82d593b 2023 /* The offset we leave with is the last one we looked at */
a935e30f 2024 pss->page--;
278e2f55
AG
2025
2026 res = ram_save_release_protection(rs, pss, start_page);
2027 return (res < 0 ? res : pages);
a82d593b 2028}
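
To make the host-page arithmetic concrete, here is a small hedged example; the 2 MiB huge page and 4 KiB target page sizes are illustrative and depend on the RAMBlock's backing store and the target:

/* Illustrative numbers only: 2 MiB host pages, 4 KiB target pages.   */
size_t pagesize_bits = (2 * 1024 * 1024) >> 12;    /* 512 target pages */
/* The loop above stops once (pss->page & (pagesize_bits - 1)) == 0,  */
/* i.e. after at most 512 target pages, or earlier at used_length.    */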
6c595cde 2029
56e93d26 2030/**
3d0684b2 2031 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2032 *
2033 * Called within an RCU critical section.
2034 *
e8f3735f
XG
2035 * Returns the number of pages written where zero means no dirty pages,
2036 * or negative on error
56e93d26 2037 *
6f37bb8b 2038 * @rs: current RAM state
56e93d26 2039 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2040 *
2041 * On systems where host-page-size > target-page-size it will send all the
2042 * pages in a host page that are dirty.
56e93d26
JQ
2043 */
2044
ce25d337 2045static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2046{
b8fb8cb7 2047 PageSearchStatus pss;
56e93d26 2048 int pages = 0;
b9e60928 2049 bool again, found;
56e93d26 2050
0827b9e9
AA
2051 /* No dirty page as there is zero RAM */
2052 if (!ram_bytes_total()) {
2053 return pages;
2054 }
2055
6f37bb8b 2056 pss.block = rs->last_seen_block;
a935e30f 2057 pss.page = rs->last_page;
b8fb8cb7
DDAG
2058 pss.complete_round = false;
2059
2060 if (!pss.block) {
2061 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2062 }
56e93d26 2063
b9e60928 2064 do {
a82d593b 2065 again = true;
f20e2865 2066 found = get_queued_page(rs, &pss);
b9e60928 2067
a82d593b
DDAG
2068 if (!found) {
2069 /* priority queue empty, so just search for something dirty */
f20e2865 2070 found = find_dirty_block(rs, &pss, &again);
a82d593b 2071 }
f3f491fc 2072
a82d593b 2073 if (found) {
f20e2865 2074 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2075 }
b9e60928 2076 } while (!pages && again);
56e93d26 2077
6f37bb8b 2078 rs->last_seen_block = pss.block;
a935e30f 2079 rs->last_page = pss.page;
56e93d26
JQ
2080
2081 return pages;
2082}
2083
2084void acct_update_position(QEMUFile *f, size_t size, bool zero)
2085{
2086 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2087
56e93d26 2088 if (zero) {
9360447d 2089 ram_counters.duplicate += pages;
56e93d26 2090 } else {
9360447d
JQ
2091 ram_counters.normal += pages;
2092 ram_counters.transferred += size;
56e93d26
JQ
2093 qemu_update_position(f, size);
2094 }
2095}
2096
fbd162e6 2097static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2098{
2099 RAMBlock *block;
2100 uint64_t total = 0;
2101
89ac5a1d
DDAG
2102 RCU_READ_LOCK_GUARD();
2103
fbd162e6
YK
2104 if (count_ignored) {
2105 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2106 total += block->used_length;
2107 }
2108 } else {
2109 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2110 total += block->used_length;
2111 }
99e15582 2112 }
56e93d26
JQ
2113 return total;
2114}
2115
fbd162e6
YK
2116uint64_t ram_bytes_total(void)
2117{
2118 return ram_bytes_total_common(false);
2119}
2120
f265e0e4 2121static void xbzrle_load_setup(void)
56e93d26 2122{
f265e0e4 2123 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2124}
2125
f265e0e4
JQ
2126static void xbzrle_load_cleanup(void)
2127{
2128 g_free(XBZRLE.decoded_buf);
2129 XBZRLE.decoded_buf = NULL;
2130}
2131
7d7c96be
PX
2132static void ram_state_cleanup(RAMState **rsp)
2133{
b9ccaf6d
DDAG
2134 if (*rsp) {
2135 migration_page_queue_free(*rsp);
2136 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2137 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2138 g_free(*rsp);
2139 *rsp = NULL;
2140 }
7d7c96be
PX
2141}
2142
84593a08
PX
2143static void xbzrle_cleanup(void)
2144{
2145 XBZRLE_cache_lock();
2146 if (XBZRLE.cache) {
2147 cache_fini(XBZRLE.cache);
2148 g_free(XBZRLE.encoded_buf);
2149 g_free(XBZRLE.current_buf);
2150 g_free(XBZRLE.zero_target_page);
2151 XBZRLE.cache = NULL;
2152 XBZRLE.encoded_buf = NULL;
2153 XBZRLE.current_buf = NULL;
2154 XBZRLE.zero_target_page = NULL;
2155 }
2156 XBZRLE_cache_unlock();
2157}
2158
f265e0e4 2159static void ram_save_cleanup(void *opaque)
56e93d26 2160{
53518d94 2161 RAMState **rsp = opaque;
6b6712ef 2162 RAMBlock *block;
eb859c53 2163
278e2f55
AG
2164 /* We don't use dirty log with background snapshots */
2165 if (!migrate_background_snapshot()) {
2166 /* the caller holds the iothread lock or is in a BH, so there is
2167 * no write race against the migration bitmap
2168 */
2169 memory_global_dirty_log_stop();
2170 }
6b6712ef 2171
fbd162e6 2172 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2173 g_free(block->clear_bmap);
2174 block->clear_bmap = NULL;
6b6712ef
JQ
2175 g_free(block->bmap);
2176 block->bmap = NULL;
56e93d26
JQ
2177 }
2178
84593a08 2179 xbzrle_cleanup();
f0afa331 2180 compress_threads_save_cleanup();
7d7c96be 2181 ram_state_cleanup(rsp);
56e93d26
JQ
2182}
2183
6f37bb8b 2184static void ram_state_reset(RAMState *rs)
56e93d26 2185{
6f37bb8b
JQ
2186 rs->last_seen_block = NULL;
2187 rs->last_sent_block = NULL;
269ace29 2188 rs->last_page = 0;
6f37bb8b
JQ
2189 rs->last_version = ram_list.version;
2190 rs->ram_bulk_stage = true;
6eeb63f7 2191 rs->fpo_enabled = false;
56e93d26
JQ
2192}
2193
2194#define MAX_WAIT 50 /* ms, half buffered_file limit */
2195
4f2e4252
DDAG
2196/*
2197 * 'expected' is the value you expect the bitmap mostly to be full
2198 * of; it won't bother printing lines that are all this value.
2199 * If 'todump' is null the migration bitmap is dumped.
2200 */
6b6712ef
JQ
2201void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2202 unsigned long pages)
4f2e4252 2203{
4f2e4252
DDAG
2204 int64_t cur;
2205 int64_t linelen = 128;
2206 char linebuf[129];
2207
6b6712ef 2208 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2209 int64_t curb;
2210 bool found = false;
2211 /*
2212 * Last line; catch the case where the line length
2213 * is longer than remaining ram
2214 */
6b6712ef
JQ
2215 if (cur + linelen > pages) {
2216 linelen = pages - cur;
4f2e4252
DDAG
2217 }
2218 for (curb = 0; curb < linelen; curb++) {
2219 bool thisbit = test_bit(cur + curb, todump);
2220 linebuf[curb] = thisbit ? '1' : '.';
2221 found = found || (thisbit != expected);
2222 }
2223 if (found) {
2224 linebuf[curb] = '\0';
2225 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2226 }
2227 }
2228}
2229
e0b266f0
DDAG
2230/* **** functions for postcopy ***** */
2231
ced1c616
PB
2232void ram_postcopy_migrated_memory_release(MigrationState *ms)
2233{
2234 struct RAMBlock *block;
ced1c616 2235
fbd162e6 2236 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2237 unsigned long *bitmap = block->bmap;
2238 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2239 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2240
2241 while (run_start < range) {
2242 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2243 ram_discard_range(block->idstr,
2244 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2245 ((ram_addr_t)(run_end - run_start))
2246 << TARGET_PAGE_BITS);
ced1c616
PB
2247 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2248 }
2249 }
2250}
2251
3d0684b2
JQ
2252/**
2253 * postcopy_send_discard_bm_ram: discard a RAMBlock
2254 *
2255 * Returns zero on success
2256 *
e0b266f0 2257 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2258 *
2259 * @ms: current migration state
89dab31b 2260 * @block: RAMBlock to discard
e0b266f0 2261 */
810cf2bb 2262static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2263{
6b6712ef 2264 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2265 unsigned long current;
1e7cf8c3 2266 unsigned long *bitmap = block->bmap;
e0b266f0 2267
6b6712ef 2268 for (current = 0; current < end; ) {
1e7cf8c3 2269 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2270 unsigned long zero, discard_length;
e0b266f0 2271
33a5cb62
WY
2272 if (one >= end) {
2273 break;
2274 }
e0b266f0 2275
1e7cf8c3 2276 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2277
2278 if (zero >= end) {
2279 discard_length = end - one;
e0b266f0 2280 } else {
33a5cb62
WY
2281 discard_length = zero - one;
2282 }
810cf2bb 2283 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2284 current = one + discard_length;
e0b266f0
DDAG
2285 }
2286
2287 return 0;
2288}
2289
3d0684b2
JQ
2290/**
2291 * postcopy_each_ram_send_discard: discard all RAMBlocks
2292 *
2293 * Returns 0 for success or negative for error
2294 *
e0b266f0
DDAG
2295 * Utility for the outgoing postcopy code.
2296 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2297 * passing it bitmap indexes and name.
e0b266f0
DDAG
2298 * (qemu_ram_foreach_block ends up passing unscaled lengths
2299 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2300 *
2301 * @ms: current migration state
e0b266f0
DDAG
2302 */
2303static int postcopy_each_ram_send_discard(MigrationState *ms)
2304{
2305 struct RAMBlock *block;
2306 int ret;
2307
fbd162e6 2308 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2309 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2310
2311 /*
2312 * Postcopy sends chunks of bitmap over the wire, but it
2313 * just needs indexes at this point, avoids it having
2314 * target page specific code.
2315 */
810cf2bb
WY
2316 ret = postcopy_send_discard_bm_ram(ms, block);
2317 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2318 if (ret) {
2319 return ret;
2320 }
2321 }
2322
2323 return 0;
2324}
2325
3d0684b2 2326/**
8324ef86 2327 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2328 *
2329 * Helper for postcopy_chunk_hostpages; it's called twice to
2330 * canonicalize the two bitmaps, that are similar, but one is
2331 * inverted.
99e314eb 2332 *
3d0684b2
JQ
2333 * Postcopy requires that all target pages in a host page are dirty or
2334 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2335 *
3d0684b2 2336 * @ms: current migration state
3d0684b2 2337 * @block: block that contains the page we want to canonicalize
99e314eb 2338 */
1e7cf8c3 2339static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2340{
53518d94 2341 RAMState *rs = ram_state;
6b6712ef 2342 unsigned long *bitmap = block->bmap;
29c59172 2343 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2344 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2345 unsigned long run_start;
2346
29c59172
DDAG
2347 if (block->page_size == TARGET_PAGE_SIZE) {
2348 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2349 return;
2350 }
2351
1e7cf8c3
WY
2352 /* Find a dirty page */
2353 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2354
6b6712ef 2355 while (run_start < pages) {
99e314eb
DDAG
2356
2357 /*
2358 * If the start of this run of pages is in the middle of a host
2359 * page, then we need to fixup this host page.
2360 */
9dec3cc3 2361 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2362 /* Find the end of this run */
1e7cf8c3 2363 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2364 /*
2365 * If the end isn't at the start of a host page, then the
2366 * run doesn't finish at the end of a host page
2367 * and we need to discard.
2368 */
99e314eb
DDAG
2369 }
2370
9dec3cc3 2371 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2372 unsigned long page;
dad45ab2
WY
2373 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2374 host_ratio);
2375 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2376
99e314eb
DDAG
2377 /* Clean up the bitmap */
2378 for (page = fixup_start_addr;
2379 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2380 /*
2381 * Remark them as dirty, updating the count for any pages
2382 * that weren't previously dirty.
2383 */
0d8ec885 2384 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2385 }
2386 }
2387
1e7cf8c3
WY
2388 /* Find the next dirty page for the next iteration */
2389 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2390 }
2391}
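
A worked example of the canonicalization above, with invented numbers (4 KiB target pages inside 2 MiB host pages, so host_ratio == 512): a dirty run starting at target page 1000 does not begin on a host-page boundary, so the whole containing host page is re-marked dirty:

/* Invented values, not taken from the function above. */
unsigned long host_ratio = 512;                                 /* 2 MiB / 4 KiB  */
unsigned long run_start  = 1000;                                /* mid host page  */
unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, host_ratio);  /* 512  */
run_start = QEMU_ALIGN_UP(run_start, host_ratio);                         /* 1024 */
/* target pages 512..1023 are all set dirty; the scan resumes at 1024 */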
2392
3d0684b2 2393/**
89dab31b 2394 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 2395 *
99e314eb
DDAG
2396 * Utility for the outgoing postcopy code.
2397 *
2398 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2399 * dirty host-page size chunks as all dirty. In this case the host-page
2400 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2401 *
3d0684b2
JQ
2402 * Returns zero on success
2403 *
2404 * @ms: current migration state
6b6712ef 2405 * @block: block we want to work with
99e314eb 2406 */
6b6712ef 2407static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2408{
810cf2bb 2409 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2410
6b6712ef 2411 /*
1e7cf8c3 2412 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 2413 */
1e7cf8c3 2414 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 2415
810cf2bb 2416 postcopy_discard_send_finish(ms);
99e314eb
DDAG
2417 return 0;
2418}
2419
3d0684b2
JQ
2420/**
2421 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2422 *
2423 * Returns zero on success
2424 *
e0b266f0
DDAG
2425 * Transmit the set of pages to be discarded after precopy to the target;
2426 * these are pages that:
2427 * a) Have been previously transmitted but are now dirty again
2428 * b) Have never been transmitted; this ensures that
2429 * any pages on the destination that have been mapped by background
2430 * tasks get discarded (transparent huge pages are the specific concern)
2431 * Hopefully this is pretty sparse
3d0684b2
JQ
2432 *
2433 * @ms: current migration state
e0b266f0
DDAG
2434 */
2435int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2436{
53518d94 2437 RAMState *rs = ram_state;
6b6712ef 2438 RAMBlock *block;
e0b266f0 2439 int ret;
e0b266f0 2440
89ac5a1d 2441 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2442
2443 /* This should be our last sync, the src is now paused */
eb859c53 2444 migration_bitmap_sync(rs);
e0b266f0 2445
6b6712ef
JQ
2446 /* Easiest way to make sure we don't resume in the middle of a host-page */
2447 rs->last_seen_block = NULL;
2448 rs->last_sent_block = NULL;
2449 rs->last_page = 0;
e0b266f0 2450
fbd162e6 2451 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2452 /* Deal with TPS != HPS and huge pages */
2453 ret = postcopy_chunk_hostpages(ms, block);
2454 if (ret) {
6b6712ef
JQ
2455 return ret;
2456 }
e0b266f0 2457
e0b266f0 2458#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
2459 ram_debug_dump_bitmap(block->bmap, true,
2460 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 2461#endif
6b6712ef
JQ
2462 }
2463 trace_ram_postcopy_send_discard_bitmap();
e0b266f0 2464
b3ac2b94 2465 return postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
2466}
2467
3d0684b2
JQ
2468/**
2469 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2470 *
3d0684b2 2471 * Returns zero on success
e0b266f0 2472 *
36449157
JQ
2473 * @rbname: name of the RAMBlock of the request. NULL means the
2474 * same as the last one.
3d0684b2
JQ
2475 * @start: RAMBlock starting page
2476 * @length: RAMBlock size
e0b266f0 2477 */
aaa2064c 2478int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2479{
36449157 2480 trace_ram_discard_range(rbname, start, length);
d3a5038c 2481
89ac5a1d 2482 RCU_READ_LOCK_GUARD();
36449157 2483 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2484
2485 if (!rb) {
36449157 2486 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2487 return -1;
e0b266f0
DDAG
2488 }
2489
814bb08f
PX
2490 /*
2491 * On source VM, we don't need to update the received bitmap since
2492 * we don't even have one.
2493 */
2494 if (rb->receivedmap) {
2495 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2496 length >> qemu_target_page_bits());
2497 }
2498
03acb4e9 2499 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2500}
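
For illustration (the block name and length are made up), discarding the first megabyte of a block on the incoming side would look like the following; a negative return covers both an unknown block and a failed underlying discard:

/* Hypothetical example call. */
if (ram_discard_range("pc.ram", 0, 1024 * 1024) < 0) {
    /* unknown RAMBlock or ram_block_discard_range() failed */
}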
2501
84593a08
PX
2502/*
2503 * For every allocation, we will try not to crash the VM if the
2504 * allocation fails.
2505 */
2506static int xbzrle_init(void)
2507{
2508 Error *local_err = NULL;
2509
2510 if (!migrate_use_xbzrle()) {
2511 return 0;
2512 }
2513
2514 XBZRLE_cache_lock();
2515
2516 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2517 if (!XBZRLE.zero_target_page) {
2518 error_report("%s: Error allocating zero page", __func__);
2519 goto err_out;
2520 }
2521
2522 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2523 TARGET_PAGE_SIZE, &local_err);
2524 if (!XBZRLE.cache) {
2525 error_report_err(local_err);
2526 goto free_zero_page;
2527 }
2528
2529 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2530 if (!XBZRLE.encoded_buf) {
2531 error_report("%s: Error allocating encoded_buf", __func__);
2532 goto free_cache;
2533 }
2534
2535 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2536 if (!XBZRLE.current_buf) {
2537 error_report("%s: Error allocating current_buf", __func__);
2538 goto free_encoded_buf;
2539 }
2540
2541 /* We are all good */
2542 XBZRLE_cache_unlock();
2543 return 0;
2544
2545free_encoded_buf:
2546 g_free(XBZRLE.encoded_buf);
2547 XBZRLE.encoded_buf = NULL;
2548free_cache:
2549 cache_fini(XBZRLE.cache);
2550 XBZRLE.cache = NULL;
2551free_zero_page:
2552 g_free(XBZRLE.zero_target_page);
2553 XBZRLE.zero_target_page = NULL;
2554err_out:
2555 XBZRLE_cache_unlock();
2556 return -ENOMEM;
2557}
2558
53518d94 2559static int ram_state_init(RAMState **rsp)
56e93d26 2560{
7d00ee6a
PX
2561 *rsp = g_try_new0(RAMState, 1);
2562
2563 if (!*rsp) {
2564 error_report("%s: Init ramstate fail", __func__);
2565 return -1;
2566 }
53518d94
JQ
2567
2568 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2569 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2570 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2571
7d00ee6a 2572 /*
40c4d4a8
IR
2573 * Count the total number of pages used by ram blocks not including any
2574 * gaps due to alignment or unplugs.
03158519 2575 * This must match with the initial values of dirty bitmap.
7d00ee6a 2576 */
40c4d4a8 2577 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2578 ram_state_reset(*rsp);
2579
2580 return 0;
2581}
2582
d6eff5d7 2583static void ram_list_init_bitmaps(void)
7d00ee6a 2584{
002cad6b 2585 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2586 RAMBlock *block;
2587 unsigned long pages;
002cad6b 2588 uint8_t shift;
56e93d26 2589
0827b9e9
AA
2590 /* Skip setting bitmap if there is no RAM */
2591 if (ram_bytes_total()) {
002cad6b
PX
2592 shift = ms->clear_bitmap_shift;
2593 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2594 error_report("clear_bitmap_shift (%u) too big, using "
2595 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2596 shift = CLEAR_BITMAP_SHIFT_MAX;
2597 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2598 error_report("clear_bitmap_shift (%u) too small, using "
2599 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2600 shift = CLEAR_BITMAP_SHIFT_MIN;
2601 }
2602
fbd162e6 2603 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2604 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2605 /*
2606 * The initial dirty bitmap for migration must be set with all
2607 * ones to make sure we'll migrate every guest RAM page to
2608 * destination.
40c4d4a8
IR
2609 * Here we set RAMBlock.bmap all to 1 because when restarting a
2610 * new migration after a failed one, ram_list.
2611 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2612 * guest memory.
03158519 2613 */
6b6712ef 2614 block->bmap = bitmap_new(pages);
40c4d4a8 2615 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2616 block->clear_bmap_shift = shift;
2617 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2618 }
f3f491fc 2619 }
d6eff5d7
PX
2620}
2621
2622static void ram_init_bitmaps(RAMState *rs)
2623{
2624 /* For memory_global_dirty_log_start below. */
2625 qemu_mutex_lock_iothread();
2626 qemu_mutex_lock_ramlist();
f3f491fc 2627
89ac5a1d
DDAG
2628 WITH_RCU_READ_LOCK_GUARD() {
2629 ram_list_init_bitmaps();
278e2f55
AG
2630 /* We don't use dirty log with background snapshots */
2631 if (!migrate_background_snapshot()) {
2632 memory_global_dirty_log_start();
2633 migration_bitmap_sync_precopy(rs);
2634 }
89ac5a1d 2635 }
56e93d26 2636 qemu_mutex_unlock_ramlist();
49877834 2637 qemu_mutex_unlock_iothread();
d6eff5d7
PX
2638}
2639
2640static int ram_init_all(RAMState **rsp)
2641{
2642 if (ram_state_init(rsp)) {
2643 return -1;
2644 }
2645
2646 if (xbzrle_init()) {
2647 ram_state_cleanup(rsp);
2648 return -1;
2649 }
2650
2651 ram_init_bitmaps(*rsp);
a91246c9
HZ
2652
2653 return 0;
2654}
2655
08614f34
PX
2656static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2657{
2658 RAMBlock *block;
2659 uint64_t pages = 0;
2660
2661 /*
2662 * Postcopy is not using xbzrle/compression, so no need for that.
2663 * Also, since the source is already halted, we don't need to care
2664 * about dirty page logging either.
2665 */
2666
fbd162e6 2667 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2668 pages += bitmap_count_one(block->bmap,
2669 block->used_length >> TARGET_PAGE_BITS);
2670 }
2671
2672 /* This may not be aligned with current bitmaps. Recalculate. */
2673 rs->migration_dirty_pages = pages;
2674
2675 rs->last_seen_block = NULL;
2676 rs->last_sent_block = NULL;
2677 rs->last_page = 0;
2678 rs->last_version = ram_list.version;
2679 /*
2680 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2681 * matter what we have sent.
2682 */
2683 rs->ram_bulk_stage = false;
2684
2685 /* Update RAMState cache of output QEMUFile */
2686 rs->f = out;
2687
2688 trace_ram_state_resume_prepare(pages);
2689}
2690
6bcb05fc
WW
2691/*
2692 * This function clears bits of the free pages reported by the caller from the
2693 * migration dirty bitmap. @addr is the host address corresponding to the
2694 * start of the contiguous guest free pages, and @len is the total bytes of
2695 * those pages.
2696 */
2697void qemu_guest_free_page_hint(void *addr, size_t len)
2698{
2699 RAMBlock *block;
2700 ram_addr_t offset;
2701 size_t used_len, start, npages;
2702 MigrationState *s = migrate_get_current();
2703
2704 /* This function is currently expected to be used during live migration */
2705 if (!migration_is_setup_or_active(s->state)) {
2706 return;
2707 }
2708
2709 for (; len > 0; len -= used_len, addr += used_len) {
2710 block = qemu_ram_block_from_host(addr, false, &offset);
2711 if (unlikely(!block || offset >= block->used_length)) {
2712 /*
2713 * The implementation might not support RAMBlock resize during
2714 * live migration, but it could happen in theory with future
2715 * updates. So we add a check here to capture that case.
2716 */
2717 error_report_once("%s unexpected error", __func__);
2718 return;
2719 }
2720
2721 if (len <= block->used_length - offset) {
2722 used_len = len;
2723 } else {
2724 used_len = block->used_length - offset;
2725 }
2726
2727 start = offset >> TARGET_PAGE_BITS;
2728 npages = used_len >> TARGET_PAGE_BITS;
2729
2730 qemu_mutex_lock(&ram_state->bitmap_mutex);
2731 ram_state->migration_dirty_pages -=
2732 bitmap_count_one_with_offset(block->bmap, start, npages);
2733 bitmap_clear(block->bmap, start, npages);
2734 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2735 }
2736}
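
As a hedged usage sketch of the hint interface above (the pointer and length are hypothetical; the real caller is the free-page-hinting path), reporting sixteen contiguous free target pages would be:

/* Hypothetical: host_addr points at guest RAM the guest reported as free. */
qemu_guest_free_page_hint(host_addr, 16 * TARGET_PAGE_SIZE);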
2737
3d0684b2
JQ
2738/*
2739 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2740 * a long-running RCU critical section. When RCU reclaims in the code
2741 * start to become numerous it will be necessary to reduce the
2742 * granularity of these critical sections.
2743 */
2744
3d0684b2
JQ
2745/**
2746 * ram_save_setup: Setup RAM for migration
2747 *
2748 * Returns zero to indicate success and negative for error
2749 *
2750 * @f: QEMUFile where to send the data
2751 * @opaque: RAMState pointer
2752 */
a91246c9
HZ
2753static int ram_save_setup(QEMUFile *f, void *opaque)
2754{
53518d94 2755 RAMState **rsp = opaque;
a91246c9
HZ
2756 RAMBlock *block;
2757
dcaf446e
XG
2758 if (compress_threads_save_setup()) {
2759 return -1;
2760 }
2761
a91246c9
HZ
2762 /* migration has already setup the bitmap, reuse it. */
2763 if (!migration_in_colo_state()) {
7d00ee6a 2764 if (ram_init_all(rsp) != 0) {
dcaf446e 2765 compress_threads_save_cleanup();
a91246c9 2766 return -1;
53518d94 2767 }
a91246c9 2768 }
53518d94 2769 (*rsp)->f = f;
a91246c9 2770
0e6ebd48
DDAG
2771 WITH_RCU_READ_LOCK_GUARD() {
2772 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2773
0e6ebd48
DDAG
2774 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2775 qemu_put_byte(f, strlen(block->idstr));
2776 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2777 qemu_put_be64(f, block->used_length);
2778 if (migrate_postcopy_ram() && block->page_size !=
2779 qemu_host_page_size) {
2780 qemu_put_be64(f, block->page_size);
2781 }
2782 if (migrate_ignore_shared()) {
2783 qemu_put_be64(f, block->mr->addr);
2784 }
fbd162e6 2785 }
56e93d26
JQ
2786 }
2787
56e93d26
JQ
2788 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2789 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2790
99f2c6fb 2791 multifd_send_sync_main(f);
56e93d26 2792 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2793 qemu_fflush(f);
56e93d26
JQ
2794
2795 return 0;
2796}
2797
3d0684b2
JQ
2798/**
2799 * ram_save_iterate: iterative stage for migration
2800 *
2801 * Returns zero to indicate success and negative for error
2802 *
2803 * @f: QEMUFile where to send the data
2804 * @opaque: RAMState pointer
2805 */
56e93d26
JQ
2806static int ram_save_iterate(QEMUFile *f, void *opaque)
2807{
53518d94
JQ
2808 RAMState **temp = opaque;
2809 RAMState *rs = *temp;
3d4095b2 2810 int ret = 0;
56e93d26
JQ
2811 int i;
2812 int64_t t0;
5c90308f 2813 int done = 0;
56e93d26 2814
b2557345
PL
2815 if (blk_mig_bulk_active()) {
2816 /* Avoid transferring ram during bulk phase of block migration as
2817 * the bulk phase will usually take a long time and transferring
2818 * ram updates during that time is pointless. */
2819 goto out;
2820 }
2821
89ac5a1d
DDAG
2822 WITH_RCU_READ_LOCK_GUARD() {
2823 if (ram_list.version != rs->last_version) {
2824 ram_state_reset(rs);
2825 }
56e93d26 2826
89ac5a1d
DDAG
2827 /* Read version before ram_list.blocks */
2828 smp_rmb();
56e93d26 2829
89ac5a1d 2830 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2831
89ac5a1d
DDAG
2832 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2833 i = 0;
2834 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2835 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2836 int pages;
e03a34f8 2837
89ac5a1d
DDAG
2838 if (qemu_file_get_error(f)) {
2839 break;
2840 }
e8f3735f 2841
89ac5a1d
DDAG
2842 pages = ram_find_and_save_block(rs, false);
2843 /* no more pages to send */
2844 if (pages == 0) {
2845 done = 1;
2846 break;
2847 }
e8f3735f 2848
89ac5a1d
DDAG
2849 if (pages < 0) {
2850 qemu_file_set_error(f, pages);
56e93d26
JQ
2851 break;
2852 }
89ac5a1d
DDAG
2853
2854 rs->target_page_count += pages;
2855
644acf99
WY
2856 /*
2857 * During postcopy, it is necessary to make sure one whole host
2858 * page is sent in one chunk.
2859 */
2860 if (migrate_postcopy_ram()) {
2861 flush_compressed_data(rs);
2862 }
2863
89ac5a1d
DDAG
2864 /*
2865 * we want to check in the 1st loop, just in case it was the 1st
2866 * time and we had to sync the dirty bitmap.
2867 * qemu_clock_get_ns() is a bit expensive, so we only check every
2868 * few iterations
2869 */
2870 if ((i & 63) == 0) {
2871 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2872 1000000;
2873 if (t1 > MAX_WAIT) {
2874 trace_ram_save_iterate_big_wait(t1, i);
2875 break;
2876 }
2877 }
2878 i++;
56e93d26 2879 }
56e93d26 2880 }
56e93d26
JQ
2881
2882 /*
2883 * Must occur before EOS (or any QEMUFile operation)
2884 * because of RDMA protocol.
2885 */
2886 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2887
b2557345 2888out:
b69a0227
JQ
2889 if (ret >= 0
2890 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 2891 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2892 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2893 qemu_fflush(f);
2894 ram_counters.transferred += 8;
56e93d26 2895
3d4095b2
JQ
2896 ret = qemu_file_get_error(f);
2897 }
56e93d26
JQ
2898 if (ret < 0) {
2899 return ret;
2900 }
2901
5c90308f 2902 return done;
56e93d26
JQ
2903}
2904
3d0684b2
JQ
2905/**
2906 * ram_save_complete: function called to send the remaining amount of ram
2907 *
e8f3735f 2908 * Returns zero to indicate success or negative on error
3d0684b2
JQ
2909 *
2910 * Called with iothread lock
2911 *
2912 * @f: QEMUFile where to send the data
2913 * @opaque: RAMState pointer
2914 */
56e93d26
JQ
2915static int ram_save_complete(QEMUFile *f, void *opaque)
2916{
53518d94
JQ
2917 RAMState **temp = opaque;
2918 RAMState *rs = *temp;
e8f3735f 2919 int ret = 0;
6f37bb8b 2920
89ac5a1d
DDAG
2921 WITH_RCU_READ_LOCK_GUARD() {
2922 if (!migration_in_postcopy()) {
2923 migration_bitmap_sync_precopy(rs);
2924 }
56e93d26 2925
89ac5a1d 2926 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 2927
89ac5a1d 2928 /* try transferring iterative blocks of memory */
56e93d26 2929
89ac5a1d
DDAG
2930 /* flush all remaining blocks regardless of rate limiting */
2931 while (true) {
2932 int pages;
56e93d26 2933
89ac5a1d
DDAG
2934 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2935 /* no more blocks to send */
2936 if (pages == 0) {
2937 break;
2938 }
2939 if (pages < 0) {
2940 ret = pages;
2941 break;
2942 }
e8f3735f 2943 }
56e93d26 2944
89ac5a1d
DDAG
2945 flush_compressed_data(rs);
2946 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2947 }
d09a6fde 2948
3d4095b2 2949 if (ret >= 0) {
99f2c6fb 2950 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2951 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2952 qemu_fflush(f);
2953 }
56e93d26 2954
e8f3735f 2955 return ret;
56e93d26
JQ
2956}
2957
c31b098f 2958static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
2959 uint64_t *res_precopy_only,
2960 uint64_t *res_compatible,
2961 uint64_t *res_postcopy_only)
56e93d26 2962{
53518d94
JQ
2963 RAMState **temp = opaque;
2964 RAMState *rs = *temp;
56e93d26
JQ
2965 uint64_t remaining_size;
2966
9edabd4d 2967 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2968
5727309d 2969 if (!migration_in_postcopy() &&
663e6c1d 2970 remaining_size < max_size) {
56e93d26 2971 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
2972 WITH_RCU_READ_LOCK_GUARD() {
2973 migration_bitmap_sync_precopy(rs);
2974 }
56e93d26 2975 qemu_mutex_unlock_iothread();
9edabd4d 2976 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2977 }
c31b098f 2978
86e1167e
VSO
2979 if (migrate_postcopy_ram()) {
2980 /* We can do postcopy, and all the data is postcopiable */
47995026 2981 *res_compatible += remaining_size;
86e1167e 2982 } else {
47995026 2983 *res_precopy_only += remaining_size;
86e1167e 2984 }
56e93d26
JQ
2985}
2986
2987static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2988{
2989 unsigned int xh_len;
2990 int xh_flags;
063e760a 2991 uint8_t *loaded_data;
56e93d26 2992
56e93d26
JQ
2993 /* extract RLE header */
2994 xh_flags = qemu_get_byte(f);
2995 xh_len = qemu_get_be16(f);
2996
2997 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2998 error_report("Failed to load XBZRLE page - wrong compression!");
2999 return -1;
3000 }
3001
3002 if (xh_len > TARGET_PAGE_SIZE) {
3003 error_report("Failed to load XBZRLE page - len overflow!");
3004 return -1;
3005 }
f265e0e4 3006 loaded_data = XBZRLE.decoded_buf;
56e93d26 3007 /* load data and decode */
f265e0e4 3008 /* it can change loaded_data to point to an internal buffer */
063e760a 3009 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3010
3011 /* decode RLE */
063e760a 3012 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3013 TARGET_PAGE_SIZE) == -1) {
3014 error_report("Failed to load XBZRLE page - decode error!");
3015 return -1;
3016 }
3017
3018 return 0;
3019}
3020
3d0684b2
JQ
3021/**
3022 * ram_block_from_stream: read a RAMBlock id from the migration stream
3023 *
3024 * Must be called from within a rcu critical section.
3025 *
56e93d26 3026 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3027 *
3d0684b2
JQ
3028 * @f: QEMUFile where to read the data from
3029 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3030 */
3d0684b2 3031static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26 3032{
49324e93 3033 static RAMBlock *block;
56e93d26
JQ
3034 char id[256];
3035 uint8_t len;
3036
3037 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3038 if (!block) {
56e93d26
JQ
3039 error_report("Ack, bad migration stream!");
3040 return NULL;
3041 }
4c4bad48 3042 return block;
56e93d26
JQ
3043 }
3044
3045 len = qemu_get_byte(f);
3046 qemu_get_buffer(f, (uint8_t *)id, len);
3047 id[len] = 0;
3048
e3dd7493 3049 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3050 if (!block) {
3051 error_report("Can't find block %s", id);
3052 return NULL;
56e93d26
JQ
3053 }
3054
fbd162e6 3055 if (ramblock_is_ignored(block)) {
b895de50
CLG
3056 error_report("block %s should not be migrated !", id);
3057 return NULL;
3058 }
3059
4c4bad48
HZ
3060 return block;
3061}
3062
3063static inline void *host_from_ram_block_offset(RAMBlock *block,
3064 ram_addr_t offset)
3065{
3066 if (!offset_in_ramblock(block, offset)) {
3067 return NULL;
3068 }
3069
3070 return block->host + offset;
56e93d26
JQ
3071}
3072
13af18f2 3073static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3074 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3075{
3076 if (!offset_in_ramblock(block, offset)) {
3077 return NULL;
3078 }
3079 if (!block->colo_cache) {
3080 error_report("%s: colo_cache is NULL in block :%s",
3081 __func__, block->idstr);
3082 return NULL;
3083 }
7d9acafa
ZC
3084
3085 /*
3086 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3087 * It helps us decide which pages in the RAM cache should be flushed
3088 * into the VM's RAM later.
3089 */
8af66371
HZ
3090 if (record_bitmap &&
3091 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3092 ram_state->migration_dirty_pages++;
3093 }
13af18f2
ZC
3094 return block->colo_cache + offset;
3095}
3096
3d0684b2
JQ
3097/**
3098 * ram_handle_compressed: handle the zero page case
3099 *
56e93d26
JQ
3100 * If a page (or a whole RDMA chunk) has been
3101 * determined to be zero, then zap it.
3d0684b2
JQ
3102 *
3103 * @host: host address for the zero page
3104 * @ch: what the page is filled from. We only support zero
3105 * @size: size of the zero page
56e93d26
JQ
3106 */
3107void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3108{
3109 if (ch != 0 || !is_zero_range(host, size)) {
3110 memset(host, ch, size);
3111 }
3112}
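
A minimal sketch of the zero-page path described above (the host pointer is assumed to reference one target page): passing ch == 0 only touches memory when the destination page is not already zero.

/* Zero-page case: memset() is skipped if the page already reads as zero. */
ram_handle_compressed(host, 0, TARGET_PAGE_SIZE);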
3113
797ca154
XG
3114/* return the size after decompression, or negative value on error */
3115static int
3116qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3117 const uint8_t *source, size_t source_len)
3118{
3119 int err;
3120
3121 err = inflateReset(stream);
3122 if (err != Z_OK) {
3123 return -1;
3124 }
3125
3126 stream->avail_in = source_len;
3127 stream->next_in = (uint8_t *)source;
3128 stream->avail_out = dest_len;
3129 stream->next_out = dest;
3130
3131 err = inflate(stream, Z_NO_FLUSH);
3132 if (err != Z_STREAM_END) {
3133 return -1;
3134 }
3135
3136 return stream->total_out;
3137}
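
For reference, the sending side performs the inverse transform with zlib's deflate; the following is a generic, hedged sketch of that inverse, not this file's actual compression helper, which lives in the save path:

/* Generic zlib counterpart, for illustration only. */
static int sketch_compress_page(z_stream *stream, uint8_t *dest, size_t dest_len,
                                const uint8_t *source, size_t source_len)
{
    if (deflateReset(stream) != Z_OK) {
        return -1;
    }
    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;
    if (deflate(stream, Z_FINISH) != Z_STREAM_END) {
        return -1;
    }
    return stream->total_out;
}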
3138
56e93d26
JQ
3139static void *do_data_decompress(void *opaque)
3140{
3141 DecompressParam *param = opaque;
3142 unsigned long pagesize;
33d151f4 3143 uint8_t *des;
34ab9e97 3144 int len, ret;
56e93d26 3145
33d151f4 3146 qemu_mutex_lock(&param->mutex);
90e56fb4 3147 while (!param->quit) {
33d151f4
LL
3148 if (param->des) {
3149 des = param->des;
3150 len = param->len;
3151 param->des = 0;
3152 qemu_mutex_unlock(&param->mutex);
3153
56e93d26 3154 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3155
3156 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3157 param->compbuf, len);
f548222c 3158 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3159 error_report("decompress data failed");
3160 qemu_file_set_error(decomp_file, ret);
3161 }
73a8912b 3162
33d151f4
LL
3163 qemu_mutex_lock(&decomp_done_lock);
3164 param->done = true;
3165 qemu_cond_signal(&decomp_done_cond);
3166 qemu_mutex_unlock(&decomp_done_lock);
3167
3168 qemu_mutex_lock(&param->mutex);
3169 } else {
3170 qemu_cond_wait(&param->cond, &param->mutex);
3171 }
56e93d26 3172 }
33d151f4 3173 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3174
3175 return NULL;
3176}
3177
34ab9e97 3178static int wait_for_decompress_done(void)
5533b2e9
LL
3179{
3180 int idx, thread_count;
3181
3182 if (!migrate_use_compression()) {
34ab9e97 3183 return 0;
5533b2e9
LL
3184 }
3185
3186 thread_count = migrate_decompress_threads();
3187 qemu_mutex_lock(&decomp_done_lock);
3188 for (idx = 0; idx < thread_count; idx++) {
3189 while (!decomp_param[idx].done) {
3190 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3191 }
3192 }
3193 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3194 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3195}
3196
f0afa331 3197static void compress_threads_load_cleanup(void)
56e93d26
JQ
3198{
3199 int i, thread_count;
3200
3416ab5b
JQ
3201 if (!migrate_use_compression()) {
3202 return;
3203 }
56e93d26
JQ
3204 thread_count = migrate_decompress_threads();
3205 for (i = 0; i < thread_count; i++) {
797ca154
XG
3206 /*
3207 * we use it as an indicator which shows whether the thread is
3208 * properly initialized or not
3209 */
3210 if (!decomp_param[i].compbuf) {
3211 break;
3212 }
3213
56e93d26 3214 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3215 decomp_param[i].quit = true;
56e93d26
JQ
3216 qemu_cond_signal(&decomp_param[i].cond);
3217 qemu_mutex_unlock(&decomp_param[i].mutex);
3218 }
3219 for (i = 0; i < thread_count; i++) {
797ca154
XG
3220 if (!decomp_param[i].compbuf) {
3221 break;
3222 }
3223
56e93d26
JQ
3224 qemu_thread_join(decompress_threads + i);
3225 qemu_mutex_destroy(&decomp_param[i].mutex);
3226 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3227 inflateEnd(&decomp_param[i].stream);
56e93d26 3228 g_free(decomp_param[i].compbuf);
797ca154 3229 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3230 }
3231 g_free(decompress_threads);
3232 g_free(decomp_param);
56e93d26
JQ
3233 decompress_threads = NULL;
3234 decomp_param = NULL;
34ab9e97 3235 decomp_file = NULL;
56e93d26
JQ
3236}
3237
34ab9e97 3238static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3239{
3240 int i, thread_count;
3241
3242 if (!migrate_use_compression()) {
3243 return 0;
3244 }
3245
3246 thread_count = migrate_decompress_threads();
3247 decompress_threads = g_new0(QemuThread, thread_count);
3248 decomp_param = g_new0(DecompressParam, thread_count);
3249 qemu_mutex_init(&decomp_done_lock);
3250 qemu_cond_init(&decomp_done_cond);
34ab9e97 3251 decomp_file = f;
797ca154
XG
3252 for (i = 0; i < thread_count; i++) {
3253 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3254 goto exit;
3255 }
3256
3257 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3258 qemu_mutex_init(&decomp_param[i].mutex);
3259 qemu_cond_init(&decomp_param[i].cond);
3260 decomp_param[i].done = true;
3261 decomp_param[i].quit = false;
3262 qemu_thread_create(decompress_threads + i, "decompress",
3263 do_data_decompress, decomp_param + i,
3264 QEMU_THREAD_JOINABLE);
3265 }
3266 return 0;
3267exit:
3268 compress_threads_load_cleanup();
3269 return -1;
3270}
3271
c1bc6626 3272static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3273 void *host, int len)
3274{
3275 int idx, thread_count;
3276
3277 thread_count = migrate_decompress_threads();
73a8912b 3278 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3279 while (true) {
3280 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3281 if (decomp_param[idx].done) {
33d151f4
LL
3282 decomp_param[idx].done = false;
3283 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3284 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3285 decomp_param[idx].des = host;
3286 decomp_param[idx].len = len;
33d151f4
LL
3287 qemu_cond_signal(&decomp_param[idx].cond);
3288 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3289 break;
3290 }
3291 }
3292 if (idx < thread_count) {
3293 break;
73a8912b
LL
3294 } else {
3295 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3296 }
3297 }
73a8912b 3298 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3299}
3300
b70cb3b4
RL
3301 /*
3302 * We must set ram_bulk_stage to false, otherwise in
3303 * migration_bitmap_find_dirty the bitmap will be unused and
3304 * all the pages in the RAM cache will be flushed to the RAM of
3305 * the secondary VM.
3306 */
3307static void colo_init_ram_state(void)
3308{
3309 ram_state_init(&ram_state);
3310 ram_state->ram_bulk_stage = false;
3311}
3312
13af18f2
ZC
3313/*
3314 * colo cache: this is for the secondary VM, we cache the whole
3315 * memory of the secondary VM; it is necessary to hold the global lock
3316 * to call this helper.
3317 */
3318int colo_init_ram_cache(void)
3319{
3320 RAMBlock *block;
3321
44901b5a
PB
3322 WITH_RCU_READ_LOCK_GUARD() {
3323 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3324 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3325 NULL,
3326 false);
3327 if (!block->colo_cache) {
3328 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3329 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3330 block->used_length);
3331 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3332 if (block->colo_cache) {
3333 qemu_anon_ram_free(block->colo_cache, block->used_length);
3334 block->colo_cache = NULL;
3335 }
89ac5a1d 3336 }
44901b5a 3337 return -errno;
89ac5a1d 3338 }
13af18f2 3339 }
13af18f2 3340 }
44901b5a 3341
7d9acafa
ZC
3342 /*
3343 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3344 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3345 * we use the same name 'ram_bitmap' as for migration.
3346 */
3347 if (ram_bytes_total()) {
3348 RAMBlock *block;
3349
fbd162e6 3350 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3351 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3352 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3353 }
3354 }
7d9acafa 3355
b70cb3b4 3356 colo_init_ram_state();
13af18f2 3357 return 0;
13af18f2
ZC
3358}
3359
0393031a
HZ
3360/* TODO: duplicated with ram_init_bitmaps */
3361void colo_incoming_start_dirty_log(void)
3362{
3363 RAMBlock *block = NULL;
3364 /* For memory_global_dirty_log_start below. */
3365 qemu_mutex_lock_iothread();
3366 qemu_mutex_lock_ramlist();
3367
3368 memory_global_dirty_log_sync();
3369 WITH_RCU_READ_LOCK_GUARD() {
3370 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371 ramblock_sync_dirty_bitmap(ram_state, block);
3372 /* Discard this dirty bitmap record */
3373 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3374 }
3375 memory_global_dirty_log_start();
3376 }
3377 ram_state->migration_dirty_pages = 0;
3378 qemu_mutex_unlock_ramlist();
3379 qemu_mutex_unlock_iothread();
3380}
3381
13af18f2
ZC
3382/* It is necessary to hold the global lock to call this helper */
3383void colo_release_ram_cache(void)
3384{
3385 RAMBlock *block;
3386
d1955d22 3387 memory_global_dirty_log_stop();
fbd162e6 3388 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3389 g_free(block->bmap);
3390 block->bmap = NULL;
3391 }
3392
89ac5a1d
DDAG
3393 WITH_RCU_READ_LOCK_GUARD() {
3394 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3395 if (block->colo_cache) {
3396 qemu_anon_ram_free(block->colo_cache, block->used_length);
3397 block->colo_cache = NULL;
3398 }
13af18f2
ZC
3399 }
3400 }
0393031a 3401 ram_state_cleanup(&ram_state);
13af18f2
ZC
3402}
3403
f265e0e4
JQ
3404/**
3405 * ram_load_setup: Setup RAM for migration incoming side
3406 *
3407 * Returns zero to indicate success and negative for error
3408 *
3409 * @f: QEMUFile where to receive the data
3410 * @opaque: RAMState pointer
3411 */
3412static int ram_load_setup(QEMUFile *f, void *opaque)
3413{
34ab9e97 3414 if (compress_threads_load_setup(f)) {
797ca154
XG
3415 return -1;
3416 }
3417
f265e0e4 3418 xbzrle_load_setup();
f9494614 3419 ramblock_recv_map_init();
13af18f2 3420
f265e0e4
JQ
3421 return 0;
3422}
3423
3424static int ram_load_cleanup(void *opaque)
3425{
f9494614 3426 RAMBlock *rb;
56eb90af 3427
fbd162e6 3428 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3429 qemu_ram_block_writeback(rb);
56eb90af
JH
3430 }
3431
f265e0e4 3432 xbzrle_load_cleanup();
f0afa331 3433 compress_threads_load_cleanup();
f9494614 3434
fbd162e6 3435 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3436 g_free(rb->receivedmap);
3437 rb->receivedmap = NULL;
3438 }
13af18f2 3439
f265e0e4
JQ
3440 return 0;
3441}
3442
3d0684b2
JQ
3443/**
3444 * ram_postcopy_incoming_init: allocate postcopy data structures
3445 *
3446 * Returns 0 for success and negative if there was one error
3447 *
3448 * @mis: current migration incoming state
3449 *
3450 * Allocate data structures etc needed by incoming migration with
3451 * postcopy-ram. postcopy-ram's similarly named
3452 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3453 */
3454int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3455{
c136180c 3456 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3457}
3458
3d0684b2
JQ
3459/**
3460 * ram_load_postcopy: load a page in postcopy case
3461 *
3462 * Returns 0 for success or -errno in case of error
3463 *
a7180877
DDAG
3464 * Called in postcopy mode by ram_load().
3465 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3466 *
3467 * @f: QEMUFile where to receive the data
a7180877
DDAG
3468 */
3469static int ram_load_postcopy(QEMUFile *f)
3470{
3471 int flags = 0, ret = 0;
3472 bool place_needed = false;
1aa83678 3473 bool matches_target_page_size = false;
a7180877
DDAG
3474 MigrationIncomingState *mis = migration_incoming_get_current();
3475 /* Temporary page that is later 'placed' */
3414322a 3476 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 3477 void *this_host = NULL;
ddf35bdf 3478 bool all_zero = true;
4cbb3c63 3479 int target_pages = 0;
a7180877
DDAG
3480
3481 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3482 ram_addr_t addr;
3483 void *host = NULL;
3484 void *page_buffer = NULL;
3485 void *place_source = NULL;
df9ff5e1 3486 RAMBlock *block = NULL;
a7180877 3487 uint8_t ch;
644acf99 3488 int len;
a7180877
DDAG
3489
3490 addr = qemu_get_be64(f);
7a9ddfbf
PX
3491
3492 /*
3493         * If there is a QEMUFile error, we should stop here, since
3494         * "addr" may then be invalid
3495 */
3496 ret = qemu_file_get_error(f);
3497 if (ret) {
3498 break;
3499 }
3500
a7180877
DDAG
3501 flags = addr & ~TARGET_PAGE_MASK;
3502 addr &= TARGET_PAGE_MASK;
3503
3504 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
644acf99
WY
3505 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3506 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3507 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3508
3509 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3510 if (!host) {
3511 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3512 ret = -EINVAL;
3513 break;
3514 }
4cbb3c63 3515 target_pages++;
1aa83678 3516 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3517 /*
28abd200
DDAG
3518 * Postcopy requires that we place whole host pages atomically;
3519 * these may be huge pages for RAMBlocks that are backed by
3520 * hugetlbfs.
a7180877
DDAG
3521 * To make it atomic, the data is read into a temporary page
3522 * that's moved into place later.
3523             * The migration protocol uses, possibly smaller, target pages;
3524             * however, the source ensures it always sends all the components
91ba442f 3525 * of a host page in one chunk.
a7180877
DDAG
3526 */
3527 page_buffer = postcopy_host_page +
28abd200 3528 ((uintptr_t)host & (block->page_size - 1));
e5e73b0f 3529 if (target_pages == 1) {
91ba442f
WY
3530 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3531 block->page_size);
c53b7ddc
DDAG
3532 } else {
3533                 /* not the first target page within the host page */
91ba442f
WY
3534 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3535 (uintptr_t)this_host) {
3536 error_report("Non-same host page %p/%p",
3537 host, this_host);
c53b7ddc
DDAG
3538 ret = -EINVAL;
3539 break;
3540 }
a7180877
DDAG
3541 }
3542
3543 /*
3544 * If it's the last part of a host page then we place the host
3545 * page
3546 */
4cbb3c63
WY
3547 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3548 place_needed = true;
4cbb3c63 3549 }
a7180877
DDAG
3550 place_source = postcopy_host_page;
3551 }
3552
3553 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3554 case RAM_SAVE_FLAG_ZERO:
a7180877 3555 ch = qemu_get_byte(f);
2e36bc1b
WY
3556 /*
3557             * Setting page_buffer can be skipped when this is a zero page
3558             * and (block->page_size == TARGET_PAGE_SIZE).
3559 */
3560 if (ch || !matches_target_page_size) {
3561 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3562 }
a7180877
DDAG
3563 if (ch) {
3564 all_zero = false;
3565 }
3566 break;
3567
3568 case RAM_SAVE_FLAG_PAGE:
3569 all_zero = false;
1aa83678
PX
3570 if (!matches_target_page_size) {
3571                 /* For huge pages, we always use a temporary buffer */
a7180877
DDAG
3572 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3573 } else {
1aa83678
PX
3574 /*
3575                 * For small pages that match the target page size, we
3576 * avoid the qemu_file copy. Instead we directly use
3577 * the buffer of QEMUFile to place the page. Note: we
3578 * cannot do any QEMUFile operation before using that
3579 * buffer to make sure the buffer is valid when
3580 * placing the page.
a7180877
DDAG
3581 */
3582 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3583 TARGET_PAGE_SIZE);
3584 }
3585 break;
644acf99
WY
3586 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3587 all_zero = false;
3588 len = qemu_get_be32(f);
3589 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3590 error_report("Invalid compressed data length: %d", len);
3591 ret = -EINVAL;
3592 break;
3593 }
3594 decompress_data_with_multi_threads(f, page_buffer, len);
3595 break;
3596
a7180877
DDAG
3597 case RAM_SAVE_FLAG_EOS:
3598 /* normal exit */
6df264ac 3599 multifd_recv_sync_main();
a7180877
DDAG
3600 break;
3601 default:
29fccade 3602 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
3603 " (postcopy mode)", flags);
3604 ret = -EINVAL;
7a9ddfbf
PX
3605 break;
3606 }
3607
644acf99
WY
3608 /* Got the whole host page, wait for decompress before placing. */
3609 if (place_needed) {
3610 ret |= wait_for_decompress_done();
3611 }
3612
7a9ddfbf
PX
3613         /* Check for any possible file errors */
3614 if (!ret && qemu_file_get_error(f)) {
3615 ret = qemu_file_get_error(f);
a7180877
DDAG
3616 }
3617
7a9ddfbf 3618 if (!ret && place_needed) {
a7180877 3619 /* This gets called at the last target page in the host page */
91ba442f
WY
3620 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3621 block->page_size);
df9ff5e1 3622
a7180877 3623 if (all_zero) {
df9ff5e1 3624 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3625 block);
a7180877 3626 } else {
df9ff5e1 3627 ret = postcopy_place_page(mis, place_dest,
8be4620b 3628 place_source, block);
a7180877 3629 }
ddf35bdf
DH
3630 place_needed = false;
3631 target_pages = 0;
3632 /* Assume we have a zero page until we detect something different */
3633 all_zero = true;
a7180877 3634 }
a7180877
DDAG
3635 }
3636
3637 return ret;
3638}
3639
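The host-page assembly in ram_load_postcopy() reduces to offset arithmetic: each incoming target page is staged at its offset inside the temporary host-page buffer, and the buffer is only placed once page_size / TARGET_PAGE_SIZE target pages have arrived. A minimal standalone sketch of that arithmetic, assuming an illustrative 4 KiB target page and a 2 MiB hugetlbfs host page (these constants are assumptions, not taken from QEMU headers):

#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_SIZE 4096UL              /* illustrative target page */
#define HOST_PAGE_SIZE   (2 * 1024 * 1024UL) /* e.g. a hugetlbfs host page */

int main(void)
{
    /* An arbitrary guest address somewhere inside a huge host page. */
    uintptr_t host = 0x7f0000200000UL + 3 * TARGET_PAGE_SIZE;

    /* Offset used when staging data into the temporary buffer, mirroring
     * page_buffer = postcopy_host_page +
     *               ((uintptr_t)host & (block->page_size - 1)); */
    uintptr_t buffer_offset = host & (HOST_PAGE_SIZE - 1);

    /* Base address that is eventually placed atomically, mirroring
     * QEMU_ALIGN_DOWN(host, block->page_size). */
    uintptr_t place_dest = host & ~(HOST_PAGE_SIZE - 1);

    unsigned long per_host_page = HOST_PAGE_SIZE / TARGET_PAGE_SIZE;

    printf("buffer offset 0x%lx, place dest 0x%lx, "
           "place after %lu target pages\n",
           (unsigned long)buffer_offset, (unsigned long)place_dest,
           per_host_page);
    return 0;
}

With these sizes, 512 target pages have to arrive before place_needed becomes true and the host page is handed to postcopy_place_page().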
acab30b8
DHB
3640static bool postcopy_is_advised(void)
3641{
3642 PostcopyState ps = postcopy_state_get();
3643 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3644}
3645
3646static bool postcopy_is_running(void)
3647{
3648 PostcopyState ps = postcopy_state_get();
3649 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3650}
3651
e6f4aa18
ZC
3652/*
3653 * Flush content of RAM cache into SVM's memory.
3654 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3655 */
24fa16f8 3656void colo_flush_ram_cache(void)
e6f4aa18
ZC
3657{
3658 RAMBlock *block = NULL;
3659 void *dst_host;
3660 void *src_host;
3661 unsigned long offset = 0;
3662
d1955d22 3663 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3664 WITH_RCU_READ_LOCK_GUARD() {
3665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3666 ramblock_sync_dirty_bitmap(ram_state, block);
3667 }
d1955d22 3668 }
d1955d22 3669
e6f4aa18 3670 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3671 WITH_RCU_READ_LOCK_GUARD() {
3672 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3673
89ac5a1d
DDAG
3674 while (block) {
3675 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 3676
8bba004c
AR
3677 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3678 >= block->used_length) {
89ac5a1d
DDAG
3679 offset = 0;
3680 block = QLIST_NEXT_RCU(block, next);
3681 } else {
3682 migration_bitmap_clear_dirty(ram_state, block, offset);
8bba004c
AR
3683 dst_host = block->host
3684 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3685 src_host = block->colo_cache
3686 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
89ac5a1d
DDAG
3687 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3688 }
e6f4aa18
ZC
3689 }
3690 }
e6f4aa18
ZC
3691 trace_colo_flush_ram_cache_end();
3692}
3693
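colo_flush_ram_cache() is essentially a dirty-bitmap walk: find the next set bit, clear it, and copy that page from the cache into the running VM's memory. A self-contained toy version of the same pattern, using a plain unsigned long as the bitmap instead of QEMU's bitmap helpers (the page size and page count are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 8   /* toy page size */
#define NR_PAGES  16

static uint8_t cache[NR_PAGES][PAGE_SIZE]; /* stands in for block->colo_cache */
static uint8_t guest[NR_PAGES][PAGE_SIZE]; /* stands in for block->host */
static unsigned long dirty;                /* one bit per page */

int main(void)
{
    memset(cache[3], 0xaa, PAGE_SIZE);
    memset(cache[9], 0xbb, PAGE_SIZE);
    dirty = (1UL << 3) | (1UL << 9);

    /* Walk the dirty bitmap, clear each bit, copy cache -> guest. */
    for (unsigned long page = 0; page < NR_PAGES; page++) {
        if (!(dirty & (1UL << page))) {
            continue;
        }
        dirty &= ~(1UL << page);
        memcpy(guest[page], cache[page], PAGE_SIZE);
        printf("flushed page %lu (guest now 0x%02x)\n", page, guest[page][0]);
    }
    return 0;
}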
10da4a36
WY
3694/**
3695 * ram_load_precopy: load pages in precopy case
3696 *
3697 * Returns 0 for success or -errno in case of error
3698 *
3699 * Called in precopy mode by ram_load().
3700 * rcu_read_lock is taken prior to this being called.
3701 *
3702 * @f: QEMUFile where to receive the data
3703 */
3704static int ram_load_precopy(QEMUFile *f)
56e93d26 3705{
e65cec5e 3706 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3707 /* ADVISE comes earlier; it shows that the source has the postcopy capability enabled */
acab30b8 3708 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3709 if (!migrate_use_compression()) {
3710 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3711 }
a7180877 3712
10da4a36 3713 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3714 ram_addr_t addr, total_ram_bytes;
0393031a 3715 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3716 uint8_t ch;
3717
e65cec5e
YK
3718 /*
3719         * Yield periodically to let the main loop run, but an iteration of
3720         * the main loop is expensive, so only do it every so many iterations
3721 */
3722 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3723 aio_co_schedule(qemu_get_current_aio_context(),
3724 qemu_coroutine_self());
3725 qemu_coroutine_yield();
3726 }
3727 i++;
3728
56e93d26
JQ
3729 addr = qemu_get_be64(f);
3730 flags = addr & ~TARGET_PAGE_MASK;
3731 addr &= TARGET_PAGE_MASK;
3732
edc60127
JQ
3733 if (flags & invalid_flags) {
3734 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3735 error_report("Received an unexpected compressed page");
3736 }
3737
3738 ret = -EINVAL;
3739 break;
3740 }
3741
bb890ed5 3742 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3743 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3744 RAMBlock *block = ram_block_from_stream(f, flags);
3745
0393031a 3746 host = host_from_ram_block_offset(block, addr);
13af18f2 3747 /*
0393031a
HZ
3748             * After entering the COLO stage, we should not load pages
3749             * into the SVM's memory directly; we put them into colo_cache first.
3750             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3751             * Previously, we copied all of this memory in the COLO preparation
3752             * stage, during which the VM had to be stopped, which is a
3753             * time-consuming process. Here we optimize it by backing up every
3754             * page during the migration process while COLO is enabled. Although
3755             * this affects the speed of the migration, it clearly reduces the
3756             * downtime of backing up all the SVM's memory in the COLO preparation stage.
13af18f2 3757 */
0393031a
HZ
3758 if (migration_incoming_colo_enabled()) {
3759 if (migration_incoming_in_colo_state()) {
3760 /* In COLO stage, put all pages into cache temporarily */
8af66371 3761 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3762 } else {
3763 /*
3764 * In migration stage but before COLO stage,
3765 * Put all pages into both cache and SVM's memory.
3766 */
8af66371 3767 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3768 }
13af18f2 3769 }
a776aa15
DDAG
3770 if (!host) {
3771 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3772 ret = -EINVAL;
3773 break;
3774 }
13af18f2
ZC
3775 if (!migration_incoming_in_colo_state()) {
3776 ramblock_recv_bitmap_set(block, host);
3777 }
3778
1db9d8e5 3779 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3780 }
3781
56e93d26
JQ
3782 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3783 case RAM_SAVE_FLAG_MEM_SIZE:
3784 /* Synchronize RAM block list */
3785 total_ram_bytes = addr;
3786 while (!ret && total_ram_bytes) {
3787 RAMBlock *block;
56e93d26
JQ
3788 char id[256];
3789 ram_addr_t length;
3790
3791 len = qemu_get_byte(f);
3792 qemu_get_buffer(f, (uint8_t *)id, len);
3793 id[len] = 0;
3794 length = qemu_get_be64(f);
3795
e3dd7493 3796 block = qemu_ram_block_by_name(id);
b895de50
CLG
3797 if (block && !qemu_ram_is_migratable(block)) {
3798 error_report("block %s should not be migrated !", id);
3799 ret = -EINVAL;
3800 } else if (block) {
e3dd7493
DDAG
3801 if (length != block->used_length) {
3802 Error *local_err = NULL;
56e93d26 3803
fa53a0e5 3804 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3805 &local_err);
3806 if (local_err) {
3807 error_report_err(local_err);
56e93d26 3808 }
56e93d26 3809 }
ef08fb38
DDAG
3810 /* For postcopy we need to check hugepage sizes match */
3811 if (postcopy_advised &&
3812 block->page_size != qemu_host_page_size) {
3813 uint64_t remote_page_size = qemu_get_be64(f);
3814 if (remote_page_size != block->page_size) {
3815 error_report("Mismatched RAM page size %s "
3816 "(local) %zd != %" PRId64,
3817 id, block->page_size,
3818 remote_page_size);
3819 ret = -EINVAL;
3820 }
3821 }
fbd162e6
YK
3822 if (migrate_ignore_shared()) {
3823 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
3824 if (ramblock_is_ignored(block) &&
3825 block->mr->addr != addr) {
3826 error_report("Mismatched GPAs for block %s "
3827 "%" PRId64 "!= %" PRId64,
3828 id, (uint64_t)addr,
3829 (uint64_t)block->mr->addr);
3830 ret = -EINVAL;
3831 }
3832 }
e3dd7493
DDAG
3833 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3834 block->idstr);
3835 } else {
56e93d26
JQ
3836 error_report("Unknown ramblock \"%s\", cannot "
3837 "accept migration", id);
3838 ret = -EINVAL;
3839 }
3840
3841 total_ram_bytes -= length;
3842 }
3843 break;
a776aa15 3844
bb890ed5 3845 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
3846 ch = qemu_get_byte(f);
3847 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3848 break;
a776aa15 3849
56e93d26 3850 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
3851 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3852 break;
56e93d26 3853
a776aa15 3854 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
3855 len = qemu_get_be32(f);
3856 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3857 error_report("Invalid compressed data length: %d", len);
3858 ret = -EINVAL;
3859 break;
3860 }
c1bc6626 3861 decompress_data_with_multi_threads(f, host, len);
56e93d26 3862 break;
a776aa15 3863
56e93d26 3864 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
3865 if (load_xbzrle(f, addr, host) < 0) {
3866 error_report("Failed to decompress XBZRLE page at "
3867 RAM_ADDR_FMT, addr);
3868 ret = -EINVAL;
3869 break;
3870 }
3871 break;
3872 case RAM_SAVE_FLAG_EOS:
3873 /* normal exit */
6df264ac 3874 multifd_recv_sync_main();
56e93d26
JQ
3875 break;
3876 default:
3877 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 3878 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 3879 } else {
29fccade 3880 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
3881 flags);
3882 ret = -EINVAL;
3883 }
3884 }
3885 if (!ret) {
3886 ret = qemu_file_get_error(f);
3887 }
0393031a
HZ
3888 if (!ret && host_bak) {
3889 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3890 }
56e93d26
JQ
3891 }
3892
ca1a6b70 3893 ret |= wait_for_decompress_done();
10da4a36
WY
3894 return ret;
3895}
3896
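Each record handled by ram_load_precopy() begins with a 64-bit big-endian word in which the page address occupies the TARGET_PAGE_MASK bits and the low bits carry the RAM_SAVE_FLAG_* values, which the loop separates with the two masks shown above. A minimal sketch of that split, with an assumed 12-bit target-page shift and an illustrative flag value:

#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_BITS 12                           /* assumed 4 KiB page */
#define TARGET_PAGE_MASK (~((1ULL << TARGET_PAGE_BITS) - 1))
#define FLAG_PAGE        0x08ULL                      /* illustrative flag bit */

int main(void)
{
    /* What the source side would have written: page address ORed with flags. */
    uint64_t wire = (0x12345000ULL & TARGET_PAGE_MASK) | FLAG_PAGE;

    /* What the load loop does with the value read via qemu_get_be64(). */
    uint64_t flags = wire & ~TARGET_PAGE_MASK;
    uint64_t addr  = wire & TARGET_PAGE_MASK;

    printf("addr 0x%llx flags 0x%llx\n",
           (unsigned long long)addr, (unsigned long long)flags);
    return 0;
}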
3897static int ram_load(QEMUFile *f, void *opaque, int version_id)
3898{
3899 int ret = 0;
3900 static uint64_t seq_iter;
3901 /*
3902 * If the system is running in postcopy mode, page inserts into host memory must
3903 * be atomic
3904 */
3905 bool postcopy_running = postcopy_is_running();
3906
3907 seq_iter++;
3908
3909 if (version_id != 4) {
3910 return -EINVAL;
3911 }
3912
3913 /*
3914 * This RCU critical section can be very long running.
3915 * When RCU reclamations in this code start to become numerous,
3916 * it will be necessary to reduce the granularity of this
3917 * critical section.
3918 */
89ac5a1d
DDAG
3919 WITH_RCU_READ_LOCK_GUARD() {
3920 if (postcopy_running) {
3921 ret = ram_load_postcopy(f);
3922 } else {
3923 ret = ram_load_precopy(f);
3924 }
10da4a36 3925 }
55c4446b 3926 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 3927
56e93d26
JQ
3928 return ret;
3929}
3930
c6467627
VSO
3931static bool ram_has_postcopy(void *opaque)
3932{
469dd51b 3933 RAMBlock *rb;
fbd162e6 3934 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
3935 if (ramblock_is_pmem(rb)) {
3936 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3937 "is not supported now!", rb->idstr, rb->host);
3938 return false;
3939 }
3940 }
3941
c6467627
VSO
3942 return migrate_postcopy_ram();
3943}
3944
edd090c7
PX
3945/* Sync all the dirty bitmaps with the destination VM. */
3946static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3947{
3948 RAMBlock *block;
3949 QEMUFile *file = s->to_dst_file;
3950 int ramblock_count = 0;
3951
3952 trace_ram_dirty_bitmap_sync_start();
3953
fbd162e6 3954 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
3955 qemu_savevm_send_recv_bitmap(file, block->idstr);
3956 trace_ram_dirty_bitmap_request(block->idstr);
3957 ramblock_count++;
3958 }
3959
3960 trace_ram_dirty_bitmap_sync_wait();
3961
3962 /* Wait until all the ramblocks' dirty bitmaps are synced */
3963 while (ramblock_count--) {
3964 qemu_sem_wait(&s->rp_state.rp_sem);
3965 }
3966
3967 trace_ram_dirty_bitmap_sync_complete();
3968
3969 return 0;
3970}
3971
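The wait loop above is a counted handshake: one recv-bitmap request is sent per RAMBlock, and rp_sem is then waited on once per block, with ram_dirty_bitmap_reload_notify() posting it as each bitmap is reloaded on the return path. A standalone sketch of the same pattern with POSIX semaphores and a stand-in worker thread (illustrative only, not QEMU's threading code; build with -pthread):

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NR_BLOCKS 3

static sem_t rp_sem; /* stands in for s->rp_state.rp_sem */

/* Stands in for the return-path thread posting once per reloaded bitmap. */
static void *return_path_thread(void *arg)
{
    (void)arg;
    for (int i = 0; i < NR_BLOCKS; i++) {
        /* ... receive and reload one bitmap here ... */
        sem_post(&rp_sem);
    }
    return NULL;
}

int main(void)
{
    pthread_t t;

    sem_init(&rp_sem, 0, 0);
    pthread_create(&t, NULL, return_path_thread, NULL);

    /* One wait per requested block, mirroring ram_dirty_bitmap_sync_all(). */
    for (int remaining = NR_BLOCKS; remaining > 0; remaining--) {
        sem_wait(&rp_sem);
    }
    printf("all %d bitmaps synced\n", NR_BLOCKS);

    pthread_join(t, NULL);
    sem_destroy(&rp_sem);
    return 0;
}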
3972static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3973{
3974 qemu_sem_post(&s->rp_state.rp_sem);
3975}
3976
a335debb
PX
3977/*
3978 * Read the received bitmap and invert it to form the initial dirty bitmap.
3979 * This is only used when the postcopy migration is paused but wants
3980 * to resume from a middle point.
3981 */
3982int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3983{
3984 int ret = -EINVAL;
3985 QEMUFile *file = s->rp_state.from_dst_file;
3986 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 3987 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
3988 uint64_t size, end_mark;
3989
3990 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3991
3992 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3993 error_report("%s: incorrect state %s", __func__,
3994 MigrationStatus_str(s->state));
3995 return -EINVAL;
3996 }
3997
3998 /*
3999 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4000 * need the endianness conversion, and the paddings.
a335debb
PX
4001 */
4002 local_size = ROUND_UP(local_size, 8);
4003
4004 /* Add paddings */
4005 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4006
4007 size = qemu_get_be64(file);
4008
4009 /* The size of the bitmap should match our ramblock */
4010 if (size != local_size) {
4011 error_report("%s: ramblock '%s' bitmap size mismatch "
4012 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4013 block->idstr, size, local_size);
4014 ret = -EINVAL;
4015 goto out;
4016 }
4017
4018 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4019 end_mark = qemu_get_be64(file);
4020
4021 ret = qemu_file_get_error(file);
4022 if (ret || size != local_size) {
4023 error_report("%s: read bitmap failed for ramblock '%s': %d"
4024 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4025 __func__, block->idstr, ret, local_size, size);
4026 ret = -EIO;
4027 goto out;
4028 }
4029
4030 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4031 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4032 __func__, block->idstr, end_mark);
4033 ret = -EINVAL;
4034 goto out;
4035 }
4036
4037 /*
3a4452d8 4038 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4039 * The dirty bitmap won't change. We can directly modify it.
4040 */
4041 bitmap_from_le(block->bmap, le_bitmap, nbits);
4042
4043 /*
4044 * What we received is the "received bitmap". Invert it to form
4045 * the initial dirty bitmap for this ramblock.
4046 */
4047 bitmap_complement(block->bmap, block->bmap, nbits);
4048
4049 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4050
edd090c7
PX
4051 /*
4052 * We succeeded in syncing the bitmap for the current ramblock. If this is
4053 * the last one to sync, we need to notify the main send thread.
4054 */
4055 ram_dirty_bitmap_reload_notify(s);
4056
a335debb
PX
4057 ret = 0;
4058out:
bf269906 4059 g_free(le_bitmap);
a335debb
PX
4060 return ret;
4061}
4062
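The inversion performed by bitmap_complement() above follows from the meaning of the two bitmaps: a page the destination has already received must not be resent, so its dirty bit ends up clear, while every page not yet received becomes dirty. A toy illustration on an 8-page block (plain C, not using QEMU's bitmap API):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned nbits = 8;        /* toy ramblock: 8 target pages */
    uint8_t received = 0x2d;   /* pages 0, 2, 3 and 5 already received */

    /* Same idea as bitmap_complement(): received => clean, missing => dirty. */
    uint8_t dirty = (uint8_t)~received;

    for (unsigned i = 0; i < nbits; i++) {
        printf("page %u: received=%d dirty=%d\n", i,
               !!(received & (1u << i)), !!(dirty & (1u << i)));
    }
    return 0;
}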
edd090c7
PX
4063static int ram_resume_prepare(MigrationState *s, void *opaque)
4064{
4065 RAMState *rs = *(RAMState **)opaque;
08614f34 4066 int ret;
edd090c7 4067
08614f34
PX
4068 ret = ram_dirty_bitmap_sync_all(s, rs);
4069 if (ret) {
4070 return ret;
4071 }
4072
4073 ram_state_resume_prepare(rs, s->to_dst_file);
4074
4075 return 0;
edd090c7
PX
4076}
4077
56e93d26 4078static SaveVMHandlers savevm_ram_handlers = {
9907e842 4079 .save_setup = ram_save_setup,
56e93d26 4080 .save_live_iterate = ram_save_iterate,
763c906b 4081 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4082 .save_live_complete_precopy = ram_save_complete,
c6467627 4083 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4084 .save_live_pending = ram_save_pending,
4085 .load_state = ram_load,
f265e0e4
JQ
4086 .save_cleanup = ram_save_cleanup,
4087 .load_setup = ram_load_setup,
4088 .load_cleanup = ram_load_cleanup,
edd090c7 4089 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4090};
4091
4092void ram_mig_init(void)
4093{
4094 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4095 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4096}