[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
f348b6d1 31#include "qemu/cutils.h"
56e93d26
JQ
32#include "qemu/bitops.h"
33#include "qemu/bitmap.h"
7205c9ec 34#include "qemu/main-loop.h"
709e3fe8 35#include "xbzrle.h"
7b1e1a22 36#include "ram.h"
6666c96a 37#include "migration.h"
f2a8f0a6 38#include "migration/register.h"
7b1e1a22 39#include "migration/misc.h"
08a0aee1 40#include "qemu-file.h"
be07b0ac 41#include "postcopy-ram.h"
53d37d36 42#include "page_cache.h"
56e93d26 43#include "qemu/error-report.h"
e688df6b 44#include "qapi/error.h"
ab7cbb0b 45#include "qapi/qapi-types-migration.h"
9af23989 46#include "qapi/qapi-events-migration.h"
8acabf69 47#include "qapi/qmp/qerror.h"
56e93d26 48#include "trace.h"
56e93d26 49#include "exec/ram_addr.h"
f9494614 50#include "exec/target_page.h"
56e93d26 51#include "qemu/rcu_queue.h"
a91246c9 52#include "migration/colo.h"
53d37d36 53#include "block.h"
af8b7d2b 54#include "sysemu/sysemu.h"
edd090c7 55#include "savevm.h"
b9ee2f7d 56#include "qemu/iov.h"
d32ca5ad 57#include "multifd.h"
56e93d26 58
56e93d26
JQ
59/***********************************************************/
60/* ram save/restore */
61
bb890ed5
JQ
62/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value. And to avoid confusion with
65 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
66 */
67
56e93d26 68#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 69#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
70#define RAM_SAVE_FLAG_MEM_SIZE 0x04
71#define RAM_SAVE_FLAG_PAGE 0x08
72#define RAM_SAVE_FLAG_EOS 0x10
73#define RAM_SAVE_FLAG_CONTINUE 0x20
74#define RAM_SAVE_FLAG_XBZRLE 0x40
75/* 0x80 is reserved in migration.h; start with 0x100 next */
76#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
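/*
 * These flags are ORed into the low bits of the page offset word that
 * save_page_header() puts on the wire; offsets are target-page aligned,
 * so the low bits are free to carry them.
 */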
56e93d26
JQ
78static inline bool is_zero_range(uint8_t *p, uint64_t size)
79{
a1febc49 80 return buffer_is_zero(p, size);
56e93d26
JQ
81}
82
9360447d
JQ
83XBZRLECacheStats xbzrle_counters;
84
56e93d26
JQ
85/* struct contains XBZRLE cache and a static page
86 used by the compression */
87static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
c00e0928
JQ
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
f265e0e4
JQ
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
56e93d26
JQ
99} XBZRLE;
100
56e93d26
JQ
101static void XBZRLE_cache_lock(void)
102{
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105}
106
107static void XBZRLE_cache_unlock(void)
108{
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111}
112
3d0684b2
JQ
113/**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
c9dede2d 121 * Returns 0 for success or -1 for error
3d0684b2
JQ
122 *
123 * @new_size: new cache size
8acabf69 124 * @errp: set *errp with the reason if the check failed
56e93d26 125 */
c9dede2d 126int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
127{
128 PageCache *new_cache;
c9dede2d 129 int64_t ret = 0;
56e93d26 130
8acabf69
JQ
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
2a313e5c
JQ
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
c9dede2d 140 return 0;
2a313e5c
JQ
141 }
142
56e93d26
JQ
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
80f8dfde 146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 147 if (!new_cache) {
56e93d26
JQ
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
56e93d26
JQ
155out:
156 XBZRLE_cache_unlock();
157 return ret;
158}
159
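/*
 * Minimal usage sketch (illustrative only, not part of ram.c): resize the
 * cache to 64MB from a hypothetical caller and report any failure.
 */
#if 0
    {
        Error *local_err = NULL;

        if (xbzrle_cache_resize(64 * 1024 * 1024, &local_err) < 0) {
            error_report_err(local_err);
        }
    }
#endif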
fbd162e6
YK
160static bool ramblock_is_ignored(RAMBlock *block)
161{
162 return !qemu_ram_is_migratable(block) ||
163 (migrate_ignore_shared() && qemu_ram_is_shared(block));
164}
165
b895de50 166/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
167#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
168 INTERNAL_RAMBLOCK_FOREACH(block) \
169 if (ramblock_is_ignored(block)) {} else
170
b895de50 171#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 172 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
173 if (!qemu_ram_is_migratable(block)) {} else
174
343f632c
DDAG
175#undef RAMBLOCK_FOREACH
176
fbd162e6
YK
177int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178{
179 RAMBlock *block;
180 int ret = 0;
181
89ac5a1d
DDAG
182 RCU_READ_LOCK_GUARD();
183
fbd162e6
YK
184 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185 ret = func(block, opaque);
186 if (ret) {
187 break;
188 }
189 }
fbd162e6
YK
190 return ret;
191}
192
f9494614
AP
193static void ramblock_recv_map_init(void)
194{
195 RAMBlock *rb;
196
fbd162e6 197 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
198 assert(!rb->receivedmap);
199 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 }
201}
202
203int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204{
205 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206 rb->receivedmap);
207}
208
1cba9f6e
DDAG
209bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210{
211 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212}
213
f9494614
AP
214void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215{
216 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217}
218
219void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220 size_t nr)
221{
222 bitmap_set_atomic(rb->receivedmap,
223 ramblock_recv_bitmap_offset(host_addr, rb),
224 nr);
225}
226
a335debb
PX
227#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
228
229/*
230 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 *
 232 * Returns the number of bytes sent (>0) on success, or <0 on error.
233 */
234int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235 const char *block_name)
236{
237 RAMBlock *block = qemu_ram_block_by_name(block_name);
238 unsigned long *le_bitmap, nbits;
239 uint64_t size;
240
241 if (!block) {
242 error_report("%s: invalid block name: %s", __func__, block_name);
243 return -1;
244 }
245
246 nbits = block->used_length >> TARGET_PAGE_BITS;
247
248 /*
249 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250 * machines we may need 4 more bytes for padding (see below
 251 * comment). So extend it a bit beforehand.
252 */
253 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254
255 /*
256 * Always use little endian when sending the bitmap. This is
 257 * required so that it works even when the source and destination
 258 * VMs do not use the same endianness. (Note: big endian won't work.)
259 */
260 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261
262 /* Size of the bitmap, in bytes */
a725ef9f 263 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
264
265 /*
266 * size is always aligned to 8 bytes for 64bit machines, but it
267 * may not be true for 32bit machines. We need this padding to
268 * make sure the migration can survive even between 32bit and
269 * 64bit machines.
270 */
271 size = ROUND_UP(size, 8);
272
273 qemu_put_be64(file, size);
274 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 /*
276 * Mark as an end, in case the middle part is screwed up due to
 277 * some "mysterious" reason.
278 */
279 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280 qemu_fflush(file);
281
bf269906 282 g_free(le_bitmap);
a335debb
PX
283
284 if (qemu_file_get_error(file)) {
285 return qemu_file_get_error(file);
286 }
287
288 return size + sizeof(size);
289}
290
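/*
 * Illustrative sketch of how the stream built above could be parsed on the
 * receiving end (the real reader lives elsewhere in the migration code;
 * the local variable names here are assumed):
 */
#if 0
    {
        uint64_t size = qemu_get_be64(file);      /* padded bitmap size */
        uint8_t *buf = g_malloc(size);

        qemu_get_buffer(file, buf, size);         /* little-endian bitmap */
        if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
            error_report("%s: ending marker mismatch", __func__);
        }
        g_free(buf);
    }
#endif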
ec481c6c
JQ
291/*
292 * An outstanding page request, on the source, having been received
293 * and queued
294 */
295struct RAMSrcPageRequest {
296 RAMBlock *rb;
297 hwaddr offset;
298 hwaddr len;
299
300 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301};
302
6f37bb8b
JQ
303/* State of RAM for migration */
304struct RAMState {
204b88b8
JQ
305 /* QEMUFile used for this migration */
306 QEMUFile *f;
6f37bb8b
JQ
307 /* Last block that we have visited searching for dirty pages */
308 RAMBlock *last_seen_block;
309 /* Last block from where we have sent data */
310 RAMBlock *last_sent_block;
269ace29
JQ
311 /* Last dirty target page we have sent */
312 ram_addr_t last_page;
6f37bb8b
JQ
313 /* last ram version we have seen */
314 uint32_t last_version;
315 /* We are in the first round */
316 bool ram_bulk_stage;
6eeb63f7
WW
317 /* The free page optimization is enabled */
318 bool fpo_enabled;
8d820d6f
JQ
319 /* How many times we have dirty too many pages */
320 int dirty_rate_high_cnt;
f664da80
JQ
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
eac74159 324 /* bytes transferred at start_time */
c4bdf0cf 325 uint64_t bytes_xfer_prev;
a66cd90c 326 /* number of dirty pages since start_time */
68908ed6 327 uint64_t num_dirty_pages_period;
b5833fde
JQ
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
330
331 /* compression statistics since the beginning of the period */
 332 /* number of times there was no free thread to compress data */
 333 uint64_t compress_thread_busy_prev;
 334 /* amount of bytes after compression */
335 uint64_t compressed_size_prev;
336 /* amount of compressed pages */
337 uint64_t compress_pages_prev;
338
be8b02ed
XG
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
9360447d 343 /* number of dirty bits in the bitmap */
2dfaf12e 344 uint64_t migration_dirty_pages;
386a907b 345 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 346 QemuMutex bitmap_mutex;
68a098f3
JQ
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
ec481c6c
JQ
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
b58deb34 351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
352};
353typedef struct RAMState RAMState;
354
53518d94 355static RAMState *ram_state;
6f37bb8b 356
bd227060
WW
357static NotifierWithReturnList precopy_notifier_list;
358
359void precopy_infrastructure_init(void)
360{
361 notifier_with_return_list_init(&precopy_notifier_list);
362}
363
364void precopy_add_notifier(NotifierWithReturn *n)
365{
366 notifier_with_return_list_add(&precopy_notifier_list, n);
367}
368
369void precopy_remove_notifier(NotifierWithReturn *n)
370{
371 notifier_with_return_remove(n);
372}
373
374int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375{
376 PrecopyNotifyData pnd;
377 pnd.reason = reason;
378 pnd.errp = errp;
379
380 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381}
382
6eeb63f7
WW
383void precopy_enable_free_page_optimization(void)
384{
385 if (!ram_state) {
386 return;
387 }
388
389 ram_state->fpo_enabled = true;
390}
391
9edabd4d 392uint64_t ram_bytes_remaining(void)
2f4fde93 393{
bae416e5
DDAG
394 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395 0;
2f4fde93
JQ
396}
397
9360447d 398MigrationStats ram_counters;
96506894 399
b8fb8cb7
DDAG
400/* used by the search for pages to send */
401struct PageSearchStatus {
402 /* Current block being searched */
403 RAMBlock *block;
a935e30f
JQ
404 /* Current page to search from */
405 unsigned long page;
b8fb8cb7
DDAG
406 /* Set once we wrap around */
407 bool complete_round;
408};
409typedef struct PageSearchStatus PageSearchStatus;
410
76e03000
XG
411CompressionStats compression_counters;
412
56e93d26 413struct CompressParam {
56e93d26 414 bool done;
90e56fb4 415 bool quit;
5e5fdcff 416 bool zero_page;
56e93d26
JQ
417 QEMUFile *file;
418 QemuMutex mutex;
419 QemuCond cond;
420 RAMBlock *block;
421 ram_addr_t offset;
34ab9e97
XG
422
423 /* internally used fields */
dcaf446e 424 z_stream stream;
34ab9e97 425 uint8_t *originbuf;
56e93d26
JQ
426};
427typedef struct CompressParam CompressParam;
428
429struct DecompressParam {
73a8912b 430 bool done;
90e56fb4 431 bool quit;
56e93d26
JQ
432 QemuMutex mutex;
433 QemuCond cond;
434 void *des;
d341d9f3 435 uint8_t *compbuf;
56e93d26 436 int len;
797ca154 437 z_stream stream;
56e93d26
JQ
438};
439typedef struct DecompressParam DecompressParam;
440
441static CompressParam *comp_param;
442static QemuThread *compress_threads;
443/* comp_done_cond is used to wake up the migration thread when
444 * one of the compression threads has finished the compression.
 445 * comp_done_lock is used together with comp_done_cond.
446 */
0d9f9a5c
LL
447static QemuMutex comp_done_lock;
448static QemuCond comp_done_cond;
56e93d26
JQ
449/* The empty QEMUFileOps will be used by file in CompressParam */
450static const QEMUFileOps empty_ops = { };
451
34ab9e97 452static QEMUFile *decomp_file;
56e93d26
JQ
453static DecompressParam *decomp_param;
454static QemuThread *decompress_threads;
73a8912b
LL
455static QemuMutex decomp_done_lock;
456static QemuCond decomp_done_cond;
56e93d26 457
5e5fdcff 458static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 459 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
460
461static void *do_data_compress(void *opaque)
462{
463 CompressParam *param = opaque;
a7a9a88f
LL
464 RAMBlock *block;
465 ram_addr_t offset;
5e5fdcff 466 bool zero_page;
56e93d26 467
a7a9a88f 468 qemu_mutex_lock(&param->mutex);
90e56fb4 469 while (!param->quit) {
a7a9a88f
LL
470 if (param->block) {
471 block = param->block;
472 offset = param->offset;
473 param->block = NULL;
474 qemu_mutex_unlock(&param->mutex);
475
5e5fdcff
XG
476 zero_page = do_compress_ram_page(param->file, &param->stream,
477 block, offset, param->originbuf);
a7a9a88f 478
0d9f9a5c 479 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 480 param->done = true;
5e5fdcff 481 param->zero_page = zero_page;
0d9f9a5c
LL
482 qemu_cond_signal(&comp_done_cond);
483 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
484
485 qemu_mutex_lock(&param->mutex);
486 } else {
56e93d26
JQ
487 qemu_cond_wait(&param->cond, &param->mutex);
488 }
56e93d26 489 }
a7a9a88f 490 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
491
492 return NULL;
493}
494
f0afa331 495static void compress_threads_save_cleanup(void)
56e93d26
JQ
496{
497 int i, thread_count;
498
05306935 499 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
500 return;
501 }
05306935 502
56e93d26
JQ
503 thread_count = migrate_compress_threads();
504 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
505 /*
 506 * we use it as an indicator of whether the thread has been
 507 * properly initialized or not
508 */
509 if (!comp_param[i].file) {
510 break;
511 }
05306935
FL
512
513 qemu_mutex_lock(&comp_param[i].mutex);
514 comp_param[i].quit = true;
515 qemu_cond_signal(&comp_param[i].cond);
516 qemu_mutex_unlock(&comp_param[i].mutex);
517
56e93d26 518 qemu_thread_join(compress_threads + i);
56e93d26
JQ
519 qemu_mutex_destroy(&comp_param[i].mutex);
520 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 521 deflateEnd(&comp_param[i].stream);
34ab9e97 522 g_free(comp_param[i].originbuf);
dcaf446e
XG
523 qemu_fclose(comp_param[i].file);
524 comp_param[i].file = NULL;
56e93d26 525 }
0d9f9a5c
LL
526 qemu_mutex_destroy(&comp_done_lock);
527 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
528 g_free(compress_threads);
529 g_free(comp_param);
56e93d26
JQ
530 compress_threads = NULL;
531 comp_param = NULL;
56e93d26
JQ
532}
533
dcaf446e 534static int compress_threads_save_setup(void)
56e93d26
JQ
535{
536 int i, thread_count;
537
538 if (!migrate_use_compression()) {
dcaf446e 539 return 0;
56e93d26 540 }
56e93d26
JQ
541 thread_count = migrate_compress_threads();
542 compress_threads = g_new0(QemuThread, thread_count);
543 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
544 qemu_cond_init(&comp_done_cond);
545 qemu_mutex_init(&comp_done_lock);
56e93d26 546 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
547 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548 if (!comp_param[i].originbuf) {
549 goto exit;
550 }
551
dcaf446e
XG
552 if (deflateInit(&comp_param[i].stream,
553 migrate_compress_level()) != Z_OK) {
34ab9e97 554 g_free(comp_param[i].originbuf);
dcaf446e
XG
555 goto exit;
556 }
557
e110aa91
C
558 /* comp_param[i].file is just used as a dummy buffer to save data,
559 * set its ops to empty.
56e93d26
JQ
560 */
561 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562 comp_param[i].done = true;
90e56fb4 563 comp_param[i].quit = false;
56e93d26
JQ
564 qemu_mutex_init(&comp_param[i].mutex);
565 qemu_cond_init(&comp_param[i].cond);
566 qemu_thread_create(compress_threads + i, "compress",
567 do_data_compress, comp_param + i,
568 QEMU_THREAD_JOINABLE);
569 }
dcaf446e
XG
570 return 0;
571
572exit:
573 compress_threads_save_cleanup();
574 return -1;
56e93d26
JQ
575}
576
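/*
 * Lifecycle note: each worker created above sits in do_data_compress()
 * waiting under param->mutex until either a block/offset is posted to it
 * or param->quit is set by compress_threads_save_cleanup(), which then
 * joins the thread and releases its zlib stream and buffers.
 */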
577/**
3d0684b2 578 * save_page_header: write page header to wire
56e93d26
JQ
579 *
580 * If this is the 1st block, it also writes the block identification
581 *
3d0684b2 582 * Returns the number of bytes written
56e93d26
JQ
583 *
584 * @f: QEMUFile where to send the data
585 * @block: block that contains the page we want to send
586 * @offset: offset inside the block for the page
587 * in the lower bits, it contains flags
588 */
2bf3aa85
JQ
589static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
590 ram_addr_t offset)
56e93d26 591{
9f5f380b 592 size_t size, len;
56e93d26 593
24795694
JQ
594 if (block == rs->last_sent_block) {
595 offset |= RAM_SAVE_FLAG_CONTINUE;
596 }
2bf3aa85 597 qemu_put_be64(f, offset);
56e93d26
JQ
598 size = 8;
599
600 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 601 len = strlen(block->idstr);
2bf3aa85
JQ
602 qemu_put_byte(f, len);
603 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 604 size += 1 + len;
24795694 605 rs->last_sent_block = block;
56e93d26
JQ
606 }
607 return size;
608}
609
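/*
 * Resulting wire layout: an 8 byte big-endian (offset | flags) word,
 * followed - only when RAM_SAVE_FLAG_CONTINUE is clear - by a 1 byte
 * idstr length and the idstr bytes themselves, which is exactly what the
 * returned size accounts for.
 */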
3d0684b2
JQ
610/**
 611 * mig_throttle_guest_down: throttle down the guest
 612 *
 613 * Reduce the amount of guest cpu execution to hopefully slow down memory
614 * writes. If guest dirty memory rate is reduced below the rate at
615 * which we can transfer pages to the destination then we should be
616 * able to complete migration. Some workloads dirty memory way too
617 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
618 */
619static void mig_throttle_guest_down(void)
620{
621 MigrationState *s = migrate_get_current();
2594f56d
DB
622 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
4cbc9c7f 624 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
625
626 /* We have not started throttling yet. Let's start it. */
627 if (!cpu_throttle_active()) {
628 cpu_throttle_set(pct_initial);
629 } else {
630 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
631 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
632 pct_max));
070afca2
JH
633 }
634}
635
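/*
 * Example progression, assuming the usual defaults (cpu-throttle-initial
 * 20, cpu-throttle-increment 10, max-cpu-throttle 99): 20% -> 30% -> 40%
 * -> ... capped at 99%.
 */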
3d0684b2
JQ
636/**
637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638 *
6f37bb8b 639 * @rs: current RAM state
3d0684b2
JQ
640 * @current_addr: address for the zero page
641 *
642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
643 * The important thing is that a stale (not-yet-0'd) page be replaced
644 * by the new data.
645 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 646 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 647 */
6f37bb8b 648static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 649{
6f37bb8b 650 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
651 return;
652 }
653
654 /* We don't care if this fails to allocate a new cache page
655 * as long as it updated an old one */
c00e0928 656 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 657 ram_counters.dirty_sync_count);
56e93d26
JQ
658}
659
660#define ENCODING_FLAG_XBZRLE 0x1
661
662/**
663 * save_xbzrle_page: compress and send current page
664 *
665 * Returns: 1 means that we wrote the page
666 * 0 means that page is identical to the one already sent
667 * -1 means that xbzrle would be longer than normal
668 *
5a987738 669 * @rs: current RAM state
3d0684b2
JQ
670 * @current_data: pointer to the address of the page contents
671 * @current_addr: addr of the page
56e93d26
JQ
672 * @block: block that contains the page we want to send
673 * @offset: offset inside the block for the page
674 * @last_stage: if we are at the completion stage
56e93d26 675 */
204b88b8 676static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 677 ram_addr_t current_addr, RAMBlock *block,
072c2511 678 ram_addr_t offset, bool last_stage)
56e93d26
JQ
679{
680 int encoded_len = 0, bytes_xbzrle;
681 uint8_t *prev_cached_page;
682
9360447d
JQ
683 if (!cache_is_cached(XBZRLE.cache, current_addr,
684 ram_counters.dirty_sync_count)) {
685 xbzrle_counters.cache_miss++;
56e93d26
JQ
686 if (!last_stage) {
687 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 688 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
689 return -1;
690 } else {
691 /* update *current_data when the page has been
692 inserted into cache */
693 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694 }
695 }
696 return -1;
697 }
698
699 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700
701 /* save current buffer into memory */
702 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703
704 /* XBZRLE encoding (if there is no overflow) */
705 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707 TARGET_PAGE_SIZE);
ca353803
WY
708
709 /*
710 * Update the cache contents, so that it corresponds to the data
711 * sent, in all cases except where we skip the page.
712 */
713 if (!last_stage && encoded_len != 0) {
714 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715 /*
716 * In the case where we couldn't compress, ensure that the caller
717 * sends the data from the cache, since the guest might have
718 * changed the RAM since we copied it.
719 */
720 *current_data = prev_cached_page;
721 }
722
56e93d26 723 if (encoded_len == 0) {
55c4446b 724 trace_save_xbzrle_page_skipping();
56e93d26
JQ
725 return 0;
726 } else if (encoded_len == -1) {
55c4446b 727 trace_save_xbzrle_page_overflow();
9360447d 728 xbzrle_counters.overflow++;
56e93d26
JQ
729 return -1;
730 }
731
56e93d26 732 /* Send XBZRLE based compressed page */
2bf3aa85 733 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
734 offset | RAM_SAVE_FLAG_XBZRLE);
735 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736 qemu_put_be16(rs->f, encoded_len);
737 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 738 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
739 xbzrle_counters.pages++;
740 xbzrle_counters.bytes += bytes_xbzrle;
741 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
742
743 return 1;
744}
745
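/*
 * An XBZRLE record on the wire is therefore: the page header with
 * RAM_SAVE_FLAG_XBZRLE set, one byte of ENCODING_FLAG_XBZRLE, a 2 byte
 * big-endian encoded length, and the encoded bytes themselves (hence the
 * "encoded_len + 1 + 2" added to bytes_xbzrle above).
 */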
3d0684b2
JQ
746/**
747 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 748 *
a5f7b1a6 749 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 750 *
6f37bb8b 751 * @rs: current RAM state
3d0684b2 752 * @rb: RAMBlock where to search for dirty pages
a935e30f 753 * @start: page where we start the search
f3f491fc 754 */
56e93d26 755static inline
a935e30f 756unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 757 unsigned long start)
56e93d26 758{
6b6712ef
JQ
759 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
761 unsigned long next;
762
fbd162e6 763 if (ramblock_is_ignored(rb)) {
b895de50
CLG
764 return size;
765 }
766
6eeb63f7
WW
767 /*
768 * When the free page optimization is enabled, we need to check the bitmap
769 * to send the non-free pages rather than all the pages in the bulk stage.
770 */
771 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 772 next = start + 1;
56e93d26 773 } else {
6b6712ef 774 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
775 }
776
6b6712ef 777 return next;
56e93d26
JQ
778}
779
06b10688 780static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
781 RAMBlock *rb,
782 unsigned long page)
a82d593b
DDAG
783{
784 bool ret;
a82d593b 785
386a907b 786 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
787
788 /*
789 * Clear dirty bitmap if needed. This _must_ be called before we
 790 * send any of the pages in the chunk because we need to make sure
 791 * we can capture further page content changes when we sync dirty
 792 * log the next time. So as long as we are going to send any of
 793 * the pages in the chunk we clear the remote dirty bitmap for all.
794 * Clearing it earlier won't be a problem, but too late will.
795 */
796 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797 uint8_t shift = rb->clear_bmap_shift;
798 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
8bba004c 799 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
002cad6b
PX
800
801 /*
802 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803 * can make things easier sometimes since then start address
804 * of the small chunk will always be 64 pages aligned so the
805 * bitmap will always be aligned to unsigned long. We should
806 * even be able to remove this restriction but I'm simply
807 * keeping it.
808 */
809 assert(shift >= 6);
810 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811 memory_region_clear_dirty_bitmap(rb->mr, start, size);
812 }
813
6b6712ef 814 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
815
816 if (ret) {
0d8ec885 817 rs->migration_dirty_pages--;
a82d593b 818 }
386a907b
WW
819 qemu_mutex_unlock(&rs->bitmap_mutex);
820
a82d593b
DDAG
821 return ret;
822}
823
267691b6 824/* Called with RCU critical section */
7a3e9571 825static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 826{
0d8ec885 827 rs->migration_dirty_pages +=
5d0980a4 828 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
0d8ec885 829 &rs->num_dirty_pages_period);
56e93d26
JQ
830}
831
3d0684b2
JQ
832/**
833 * ram_pagesize_summary: calculate all the pagesizes of a VM
834 *
835 * Returns a summary bitmap of the page sizes of all RAMBlocks
836 *
837 * For VMs with just normal pages this is equivalent to the host page
838 * size. If it's got some huge pages then it's the OR of all the
839 * different page sizes.
e8ca1db2
DDAG
840 */
841uint64_t ram_pagesize_summary(void)
842{
843 RAMBlock *block;
844 uint64_t summary = 0;
845
fbd162e6 846 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
847 summary |= block->page_size;
848 }
849
850 return summary;
851}
852
aecbfe9c
XG
853uint64_t ram_get_total_transferred_pages(void)
854{
855 return ram_counters.normal + ram_counters.duplicate +
856 compression_counters.pages + xbzrle_counters.pages;
857}
858
b734035b
XG
859static void migration_update_rates(RAMState *rs, int64_t end_time)
860{
be8b02ed 861 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 862 double compressed_size;
b734035b
XG
863
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867
be8b02ed 868 if (!page_count) {
b734035b
XG
869 return;
870 }
871
872 if (migrate_use_xbzrle()) {
873 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 874 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
875 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876 }
76e03000
XG
877
878 if (migrate_use_compression()) {
879 compression_counters.busy_rate = (double)(compression_counters.busy -
880 rs->compress_thread_busy_prev) / page_count;
881 rs->compress_thread_busy_prev = compression_counters.busy;
882
883 compressed_size = compression_counters.compressed_size -
884 rs->compressed_size_prev;
885 if (compressed_size) {
886 double uncompressed_size = (compression_counters.pages -
887 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888
889 /* Compression-Ratio = Uncompressed-size / Compressed-size */
890 compression_counters.compression_rate =
891 uncompressed_size / compressed_size;
892
893 rs->compress_pages_prev = compression_counters.pages;
894 rs->compressed_size_prev = compression_counters.compressed_size;
895 }
896 }
b734035b
XG
897}
898
dc14a470
KZ
899static void migration_trigger_throttle(RAMState *rs)
900{
901 MigrationState *s = migrate_get_current();
902 uint64_t threshold = s->parameters.throttle_trigger_threshold;
903
904 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907
908 /* During block migration the auto-converge logic incorrectly detects
909 * that ram migration makes no progress. Avoid this by disabling the
910 * throttling logic during the bulk phase of block migration. */
911 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912 /* The following detection logic can be refined later. For now:
913 Check to see if the ratio between dirtied bytes and the approx.
914 amount of bytes that just got transferred since the last time
915 we were in this routine reaches the threshold. If that happens
916 twice, start or increase throttling. */
917
918 if ((bytes_dirty_period > bytes_dirty_threshold) &&
919 (++rs->dirty_rate_high_cnt >= 2)) {
920 trace_migration_throttle();
921 rs->dirty_rate_high_cnt = 0;
922 mig_throttle_guest_down();
923 }
924 }
925}
926
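/*
 * Worked example, assuming the default throttle-trigger-threshold of 50:
 * if 100MB were transferred during the period, the dirty threshold is
 * 50MB, so throttling starts (or increases) once more than 50MB were
 * dirtied in a period and that condition has been seen twice.
 */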
8d820d6f 927static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
928{
929 RAMBlock *block;
56e93d26 930 int64_t end_time;
56e93d26 931
9360447d 932 ram_counters.dirty_sync_count++;
56e93d26 933
f664da80
JQ
934 if (!rs->time_last_bitmap_sync) {
935 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
936 }
937
938 trace_migration_bitmap_sync_start();
9c1f8f44 939 memory_global_dirty_log_sync();
56e93d26 940
108cfae0 941 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
942 WITH_RCU_READ_LOCK_GUARD() {
943 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944 ramblock_sync_dirty_bitmap(rs, block);
945 }
946 ram_counters.remaining = ram_bytes_remaining();
56e93d26 947 }
108cfae0 948 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 949
9458a9a1 950 memory_global_after_dirty_log_sync();
a66cd90c 951 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 952
56e93d26
JQ
953 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954
 955 /* more than 1 second = 1000 milliseconds */
f664da80 956 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 957 migration_trigger_throttle(rs);
070afca2 958
b734035b
XG
959 migration_update_rates(rs, end_time);
960
be8b02ed 961 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
962
963 /* reset period counters */
f664da80 964 rs->time_last_bitmap_sync = end_time;
a66cd90c 965 rs->num_dirty_pages_period = 0;
dc14a470 966 rs->bytes_xfer_prev = ram_counters.transferred;
56e93d26 967 }
4addcd4f 968 if (migrate_use_events()) {
3ab72385 969 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 970 }
56e93d26
JQ
971}
972
bd227060
WW
973static void migration_bitmap_sync_precopy(RAMState *rs)
974{
975 Error *local_err = NULL;
976
977 /*
978 * The current notifier usage is just an optimization to migration, so we
979 * don't stop the normal migration process in the error case.
980 */
981 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982 error_report_err(local_err);
b4a1733c 983 local_err = NULL;
bd227060
WW
984 }
985
986 migration_bitmap_sync(rs);
987
988 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
989 error_report_err(local_err);
990 }
991}
992
6c97ec5f
XG
993/**
994 * save_zero_page_to_file: send the zero page to the file
995 *
996 * Returns the size of data written to the file, 0 means the page is not
997 * a zero page
998 *
999 * @rs: current RAM state
1000 * @file: the file where the data is saved
1001 * @block: block that contains the page we want to send
1002 * @offset: offset inside the block for the page
1003 */
1004static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005 RAMBlock *block, ram_addr_t offset)
1006{
1007 uint8_t *p = block->host + offset;
1008 int len = 0;
1009
1010 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012 qemu_put_byte(file, 0);
1013 len += 1;
1014 }
1015 return len;
1016}
1017
56e93d26 1018/**
3d0684b2 1019 * save_zero_page: send the zero page to the stream
56e93d26 1020 *
3d0684b2 1021 * Returns the number of pages written.
56e93d26 1022 *
f7ccd61b 1023 * @rs: current RAM state
56e93d26
JQ
1024 * @block: block that contains the page we want to send
1025 * @offset: offset inside the block for the page
56e93d26 1026 */
7faccdc3 1027static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1028{
6c97ec5f 1029 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1030
6c97ec5f 1031 if (len) {
9360447d 1032 ram_counters.duplicate++;
6c97ec5f
XG
1033 ram_counters.transferred += len;
1034 return 1;
56e93d26 1035 }
6c97ec5f 1036 return -1;
56e93d26
JQ
1037}
1038
5727309d 1039static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1040{
5727309d 1041 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1042 return;
1043 }
1044
8bba004c 1045 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
53f09a10
PB
1046}
1047
059ff0fb
XG
1048/*
1049 * @pages: the number of pages written by the control path,
1050 * < 0 - error
1051 * > 0 - number of pages written
1052 *
 1053 * Return true if the page has been saved, otherwise false is returned.
1054 */
1055static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056 int *pages)
1057{
1058 uint64_t bytes_xmit = 0;
1059 int ret;
1060
1061 *pages = -1;
1062 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063 &bytes_xmit);
1064 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065 return false;
1066 }
1067
1068 if (bytes_xmit) {
1069 ram_counters.transferred += bytes_xmit;
1070 *pages = 1;
1071 }
1072
1073 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074 return true;
1075 }
1076
1077 if (bytes_xmit > 0) {
1078 ram_counters.normal++;
1079 } else if (bytes_xmit == 0) {
1080 ram_counters.duplicate++;
1081 }
1082
1083 return true;
1084}
1085
65dacaa0
XG
1086/*
1087 * directly send the page to the stream
1088 *
1089 * Returns the number of pages written.
1090 *
1091 * @rs: current RAM state
1092 * @block: block that contains the page we want to send
1093 * @offset: offset inside the block for the page
1094 * @buf: the page to be sent
 1095 * @async: send the page asynchronously
1096 */
1097static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098 uint8_t *buf, bool async)
1099{
1100 ram_counters.transferred += save_page_header(rs, rs->f, block,
1101 offset | RAM_SAVE_FLAG_PAGE);
1102 if (async) {
1103 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1104 migrate_release_ram() &
1105 migration_in_postcopy());
1106 } else {
1107 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108 }
1109 ram_counters.transferred += TARGET_PAGE_SIZE;
1110 ram_counters.normal++;
1111 return 1;
1112}
1113
56e93d26 1114/**
3d0684b2 1115 * ram_save_page: send the given page to the stream
56e93d26 1116 *
3d0684b2 1117 * Returns the number of pages written.
3fd3c4b3
DDAG
1118 * < 0 - error
1119 * >=0 - Number of pages written - this might legally be 0
1120 * if xbzrle noticed the page was the same.
56e93d26 1121 *
6f37bb8b 1122 * @rs: current RAM state
56e93d26
JQ
1123 * @block: block that contains the page we want to send
1124 * @offset: offset inside the block for the page
1125 * @last_stage: if we are at the completion stage
56e93d26 1126 */
a0a8aa14 1127static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1128{
1129 int pages = -1;
56e93d26 1130 uint8_t *p;
56e93d26 1131 bool send_async = true;
a08f6890 1132 RAMBlock *block = pss->block;
8bba004c 1133 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1134 ram_addr_t current_addr = block->offset + offset;
56e93d26 1135
2f68e399 1136 p = block->host + offset;
1db9d8e5 1137 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1138
56e93d26 1139 XBZRLE_cache_lock();
d7400a34
XG
1140 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141 migrate_use_xbzrle()) {
059ff0fb
XG
1142 pages = save_xbzrle_page(rs, &p, current_addr, block,
1143 offset, last_stage);
1144 if (!last_stage) {
1145 /* Can't send this cached data async, since the cache page
1146 * might get updated before it gets to the wire
56e93d26 1147 */
059ff0fb 1148 send_async = false;
56e93d26
JQ
1149 }
1150 }
1151
1152 /* XBZRLE overflow or normal page */
1153 if (pages == -1) {
65dacaa0 1154 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1155 }
1156
1157 XBZRLE_cache_unlock();
1158
1159 return pages;
1160}
1161
b9ee2f7d
JQ
1162static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163 ram_addr_t offset)
1164{
67a4c891 1165 if (multifd_queue_page(rs->f, block, offset) < 0) {
713f762a
IR
1166 return -1;
1167 }
b9ee2f7d
JQ
1168 ram_counters.normal++;
1169
1170 return 1;
1171}
1172
5e5fdcff 1173static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1174 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1175{
53518d94 1176 RAMState *rs = ram_state;
a7a9a88f 1177 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 1178 bool zero_page = false;
6ef3771c 1179 int ret;
56e93d26 1180
5e5fdcff
XG
1181 if (save_zero_page_to_file(rs, f, block, offset)) {
1182 zero_page = true;
1183 goto exit;
1184 }
1185
6ef3771c 1186 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1187
1188 /*
 1189 * copy it to an internal buffer to avoid it being modified by the VM
 1190 * so that we can catch any error during compression and
1191 * decompression
1192 */
1193 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1194 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195 if (ret < 0) {
1196 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1197 error_report("compressed data failed!");
5e5fdcff 1198 return false;
b3be2896 1199 }
56e93d26 1200
5e5fdcff 1201exit:
6ef3771c 1202 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
1203 return zero_page;
1204}
1205
1206static void
1207update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208{
76e03000
XG
1209 ram_counters.transferred += bytes_xmit;
1210
5e5fdcff
XG
1211 if (param->zero_page) {
1212 ram_counters.duplicate++;
76e03000 1213 return;
5e5fdcff 1214 }
76e03000
XG
1215
1216 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1217 compression_counters.compressed_size += bytes_xmit - 8;
1218 compression_counters.pages++;
56e93d26
JQ
1219}
1220
32b05495
XG
1221static bool save_page_use_compression(RAMState *rs);
1222
ce25d337 1223static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1224{
1225 int idx, len, thread_count;
1226
32b05495 1227 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1228 return;
1229 }
1230 thread_count = migrate_compress_threads();
a7a9a88f 1231
0d9f9a5c 1232 qemu_mutex_lock(&comp_done_lock);
56e93d26 1233 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1234 while (!comp_param[idx].done) {
0d9f9a5c 1235 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1236 }
a7a9a88f 1237 }
0d9f9a5c 1238 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1239
1240 for (idx = 0; idx < thread_count; idx++) {
1241 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1242 if (!comp_param[idx].quit) {
ce25d337 1243 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1244 /*
1245 * it's safe to fetch zero_page without holding comp_done_lock
1246 * as there is no further request submitted to the thread,
1247 * i.e, the thread should be waiting for a request at this point.
1248 */
1249 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1250 }
a7a9a88f 1251 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1252 }
1253}
1254
1255static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256 ram_addr_t offset)
1257{
1258 param->block = block;
1259 param->offset = offset;
1260}
1261
ce25d337
JQ
1262static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263 ram_addr_t offset)
56e93d26
JQ
1264{
1265 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1266 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1267
1268 thread_count = migrate_compress_threads();
0d9f9a5c 1269 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1270retry:
1271 for (idx = 0; idx < thread_count; idx++) {
1272 if (comp_param[idx].done) {
1273 comp_param[idx].done = false;
1274 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275 qemu_mutex_lock(&comp_param[idx].mutex);
1276 set_compress_params(&comp_param[idx], block, offset);
1277 qemu_cond_signal(&comp_param[idx].cond);
1278 qemu_mutex_unlock(&comp_param[idx].mutex);
1279 pages = 1;
5e5fdcff 1280 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1281 break;
56e93d26
JQ
1282 }
1283 }
1d58872a
XG
1284
1285 /*
1286 * wait for the free thread if the user specifies 'compress-wait-thread',
1287 * otherwise we will post the page out in the main thread as normal page.
1288 */
1289 if (pages < 0 && wait) {
1290 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291 goto retry;
1292 }
0d9f9a5c 1293 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1294
1295 return pages;
1296}
1297
3d0684b2
JQ
1298/**
1299 * find_dirty_block: find the next dirty page and update any state
1300 * associated with the search process.
b9e60928 1301 *
a5f7b1a6 1302 * Returns true if a page is found
b9e60928 1303 *
6f37bb8b 1304 * @rs: current RAM state
3d0684b2
JQ
1305 * @pss: data about the state of the current dirty page scan
1306 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1307 */
f20e2865 1308static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1309{
f20e2865 1310 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1311 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1312 pss->page >= rs->last_page) {
b9e60928
DDAG
1313 /*
1314 * We've been once around the RAM and haven't found anything.
1315 * Give up.
1316 */
1317 *again = false;
1318 return false;
1319 }
8bba004c
AR
1320 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321 >= pss->block->used_length) {
b9e60928 1322 /* Didn't find anything in this RAM Block */
a935e30f 1323 pss->page = 0;
b9e60928
DDAG
1324 pss->block = QLIST_NEXT_RCU(pss->block, next);
1325 if (!pss->block) {
48df9d80
XG
1326 /*
1327 * If memory migration starts over, we will meet a dirtied page
 1328 * which may still exist in the compression threads' ring, so we
 1329 * should flush the compressed data to make sure the new page
 1330 * is not overwritten by the old one in the destination.
 1331 *
 1332 * Also, if xbzrle is on, stop using the data compression at this
1333 * point. In theory, xbzrle can do better than compression.
1334 */
1335 flush_compressed_data(rs);
1336
b9e60928
DDAG
1337 /* Hit the end of the list */
1338 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339 /* Flag that we've looped */
1340 pss->complete_round = true;
6f37bb8b 1341 rs->ram_bulk_stage = false;
b9e60928
DDAG
1342 }
1343 /* Didn't find anything this time, but try again on the new block */
1344 *again = true;
1345 return false;
1346 } else {
1347 /* Can go around again, but... */
1348 *again = true;
1349 /* We've found something so probably don't need to */
1350 return true;
1351 }
1352}
1353
3d0684b2
JQ
1354/**
1355 * unqueue_page: gets a page of the queue
1356 *
a82d593b 1357 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1358 *
3d0684b2
JQ
1359 * Returns the block of the page (or NULL if none available)
1360 *
ec481c6c 1361 * @rs: current RAM state
3d0684b2 1362 * @offset: used to return the offset within the RAMBlock
a82d593b 1363 */
f20e2865 1364static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1365{
1366 RAMBlock *block = NULL;
1367
ae526e32
XG
1368 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369 return NULL;
1370 }
1371
ec481c6c
JQ
1372 qemu_mutex_lock(&rs->src_page_req_mutex);
1373 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374 struct RAMSrcPageRequest *entry =
1375 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1376 block = entry->rb;
1377 *offset = entry->offset;
a82d593b
DDAG
1378
1379 if (entry->len > TARGET_PAGE_SIZE) {
1380 entry->len -= TARGET_PAGE_SIZE;
1381 entry->offset += TARGET_PAGE_SIZE;
1382 } else {
1383 memory_region_unref(block->mr);
ec481c6c 1384 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 1385 g_free(entry);
e03a34f8 1386 migration_consume_urgent_request();
a82d593b
DDAG
1387 }
1388 }
ec481c6c 1389 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1390
1391 return block;
1392}
1393
3d0684b2 1394/**
ff1543af 1395 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1396 *
1397 * Skips pages that are already sent (!dirty)
a82d593b 1398 *
a5f7b1a6 1399 * Returns true if a queued page is found
a82d593b 1400 *
6f37bb8b 1401 * @rs: current RAM state
3d0684b2 1402 * @pss: data about the state of the current dirty page scan
a82d593b 1403 */
f20e2865 1404static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1405{
1406 RAMBlock *block;
1407 ram_addr_t offset;
1408 bool dirty;
1409
1410 do {
f20e2865 1411 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1412 /*
1413 * We're sending this page, and since it's postcopy nothing else
1414 * will dirty it, and we must make sure it doesn't get sent again
1415 * even if this queue request was received after the background
1416 * search already sent it.
1417 */
1418 if (block) {
f20e2865
JQ
1419 unsigned long page;
1420
6b6712ef
JQ
1421 page = offset >> TARGET_PAGE_BITS;
1422 dirty = test_bit(page, block->bmap);
a82d593b 1423 if (!dirty) {
06b10688 1424 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 1425 page);
a82d593b 1426 } else {
f20e2865 1427 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1428 }
1429 }
1430
1431 } while (block && !dirty);
1432
1433 if (block) {
1434 /*
1435 * As soon as we start servicing pages out of order, then we have
1436 * to kill the bulk stage, since the bulk stage assumes
1437 * in (migration_bitmap_find_and_reset_dirty) that every page is
1438 * dirty, that's no longer true.
1439 */
6f37bb8b 1440 rs->ram_bulk_stage = false;
a82d593b
DDAG
1441
1442 /*
1443 * We want the background search to continue from the queued page
1444 * since the guest is likely to want other pages near to the page
1445 * it just requested.
1446 */
1447 pss->block = block;
a935e30f 1448 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1449
1450 /*
 1451 * This unqueued page would break the "one round" check, even if
 1452 * it is really rare.
1453 */
1454 pss->complete_round = false;
a82d593b
DDAG
1455 }
1456
1457 return !!block;
1458}
1459
6c595cde 1460/**
5e58f968
JQ
1461 * migration_page_queue_free: drop any remaining pages in the ram
1462 * request queue
6c595cde 1463 *
3d0684b2
JQ
1464 * It should be empty at the end anyway, but in error cases there may
 1465 * be some left. In case there is any page left, we drop it.
1466 *
6c595cde 1467 */
83c13382 1468static void migration_page_queue_free(RAMState *rs)
6c595cde 1469{
ec481c6c 1470 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1471 /* This queue generally should be empty - but in the case of a failed
1472 * migration might have some droppings in.
1473 */
89ac5a1d 1474 RCU_READ_LOCK_GUARD();
ec481c6c 1475 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1476 memory_region_unref(mspr->rb->mr);
ec481c6c 1477 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1478 g_free(mspr);
1479 }
6c595cde
DDAG
1480}
1481
1482/**
3d0684b2
JQ
1483 * ram_save_queue_pages: queue the page for transmission
1484 *
1485 * A request from postcopy destination for example.
1486 *
1487 * Returns zero on success or negative on error
1488 *
3d0684b2
JQ
1489 * @rbname: Name of the RAMBLock of the request. NULL means the
1490 * same that last one.
1491 * @start: starting address from the start of the RAMBlock
1492 * @len: length (in bytes) to send
6c595cde 1493 */
96506894 1494int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1495{
1496 RAMBlock *ramblock;
53518d94 1497 RAMState *rs = ram_state;
6c595cde 1498
9360447d 1499 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
1500 RCU_READ_LOCK_GUARD();
1501
6c595cde
DDAG
1502 if (!rbname) {
1503 /* Reuse last RAMBlock */
68a098f3 1504 ramblock = rs->last_req_rb;
6c595cde
DDAG
1505
1506 if (!ramblock) {
1507 /*
1508 * Shouldn't happen, we can't reuse the last RAMBlock if
1509 * it's the 1st request.
1510 */
1511 error_report("ram_save_queue_pages no previous block");
03acb4e9 1512 return -1;
6c595cde
DDAG
1513 }
1514 } else {
1515 ramblock = qemu_ram_block_by_name(rbname);
1516
1517 if (!ramblock) {
1518 /* We shouldn't be asked for a non-existent RAMBlock */
1519 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 1520 return -1;
6c595cde 1521 }
68a098f3 1522 rs->last_req_rb = ramblock;
6c595cde
DDAG
1523 }
1524 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1525 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1526 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1527 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 1528 __func__, start, len, ramblock->used_length);
03acb4e9 1529 return -1;
6c595cde
DDAG
1530 }
1531
ec481c6c
JQ
1532 struct RAMSrcPageRequest *new_entry =
1533 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1534 new_entry->rb = ramblock;
1535 new_entry->offset = start;
1536 new_entry->len = len;
1537
1538 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1539 qemu_mutex_lock(&rs->src_page_req_mutex);
1540 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 1541 migration_make_urgent_request();
ec481c6c 1542 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1543
1544 return 0;
6c595cde
DDAG
1545}
1546
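/*
 * Illustrative call (not taken from this file): the postcopy destination
 * asking for a single target page of a named block; "pc.ram" and
 * faulting_addr are assumed here.
 */
#if 0
    if (ram_save_queue_pages("pc.ram", faulting_addr & TARGET_PAGE_MASK,
                             TARGET_PAGE_SIZE) < 0) {
        /* request failed; the caller handles the error */
    }
#endif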
d7400a34
XG
1547static bool save_page_use_compression(RAMState *rs)
1548{
1549 if (!migrate_use_compression()) {
1550 return false;
1551 }
1552
1553 /*
1554 * If xbzrle is on, stop using the data compression after first
1555 * round of migration even if compression is enabled. In theory,
1556 * xbzrle can do better than compression.
1557 */
1558 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1559 return true;
1560 }
1561
1562 return false;
1563}
1564
5e5fdcff
XG
1565/*
1566 * try to compress the page before posting it out, return true if the page
1567 * has been properly handled by compression, otherwise needs other
1568 * paths to handle it
1569 */
1570static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1571{
1572 if (!save_page_use_compression(rs)) {
1573 return false;
1574 }
1575
1576 /*
1577 * When starting the process of a new block, the first page of
1578 * the block should be sent out before other pages in the same
 1579 * block, and all the pages in the last block should have been sent
 1580 * out; keeping this order is important, because the 'cont' flag
 1581 * is used to avoid resending the block name.
 1582 *
 1583 * We post the first page as a normal page as compression will take
 1584 * much CPU resource.
1585 */
1586 if (block != rs->last_sent_block) {
1587 flush_compressed_data(rs);
1588 return false;
1589 }
1590
1591 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1592 return true;
1593 }
1594
76e03000 1595 compression_counters.busy++;
5e5fdcff
XG
1596 return false;
1597}
1598
a82d593b 1599/**
3d0684b2 1600 * ram_save_target_page: save one target page
a82d593b 1601 *
3d0684b2 1602 * Returns the number of pages written
a82d593b 1603 *
6f37bb8b 1604 * @rs: current RAM state
3d0684b2 1605 * @pss: data about the page we want to send
a82d593b 1606 * @last_stage: if we are at the completion stage
a82d593b 1607 */
a0a8aa14 1608static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1609 bool last_stage)
a82d593b 1610{
a8ec91f9 1611 RAMBlock *block = pss->block;
8bba004c 1612 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
1613 int res;
1614
1615 if (control_save_page(rs, block, offset, &res)) {
1616 return res;
1617 }
1618
5e5fdcff
XG
1619 if (save_compress_page(rs, block, offset)) {
1620 return 1;
d7400a34
XG
1621 }
1622
1623 res = save_zero_page(rs, block, offset);
1624 if (res > 0) {
1625 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1626 * page would be stale
1627 */
1628 if (!save_page_use_compression(rs)) {
1629 XBZRLE_cache_lock();
1630 xbzrle_cache_zero_page(rs, block->offset + offset);
1631 XBZRLE_cache_unlock();
1632 }
1633 ram_release_pages(block->idstr, offset, res);
1634 return res;
1635 }
1636
da3f56cb 1637 /*
c6b3a2e0
WY
1638 * Do not use multifd for:
1639 * 1. Compression as the first page in the new block should be posted out
1640 * before sending the compressed page
1641 * 2. In postcopy as one whole host page should be placed
da3f56cb 1642 */
c6b3a2e0
WY
1643 if (!save_page_use_compression(rs) && migrate_use_multifd()
1644 && !migration_in_postcopy()) {
b9ee2f7d 1645 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
1646 }
1647
1faa5665 1648 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1649}
1650
1651/**
3d0684b2 1652 * ram_save_host_page: save a whole host page
a82d593b 1653 *
3d0684b2
JQ
1654 * Starting at *offset send pages up to the end of the current host
1655 * page. It's valid for the initial offset to point into the middle of
 1656 * a host page, in which case the remainder of the host page is sent.
1657 * Only dirty target pages are sent. Note that the host page size may
1658 * be a huge page for this block.
1eb3fc0a
DDAG
1659 * The saving stops at the boundary of the used_length of the block
1660 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1661 *
3d0684b2
JQ
1662 * Returns the number of pages written or negative on error
1663 *
6f37bb8b 1664 * @rs: current RAM state
3d0684b2 1665 * @ms: current migration state
3d0684b2 1666 * @pss: data about the page we want to send
a82d593b 1667 * @last_stage: if we are at the completion stage
a82d593b 1668 */
a0a8aa14 1669static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1670 bool last_stage)
a82d593b
DDAG
1671{
1672 int tmppages, pages = 0;
a935e30f
JQ
1673 size_t pagesize_bits =
1674 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1675
fbd162e6 1676 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
1677 error_report("block %s should not be migrated !", pss->block->idstr);
1678 return 0;
1679 }
1680
a82d593b 1681 do {
1faa5665
XG
1682        /* Check if the page is dirty and if it is, send it */
1683 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1684 pss->page++;
1685 continue;
1686 }
1687
f20e2865 1688 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1689 if (tmppages < 0) {
1690 return tmppages;
1691 }
1692
1693 pages += tmppages;
a935e30f 1694 pss->page++;
97e1e067
DDAG
1695 /* Allow rate limiting to happen in the middle of huge pages */
1696 migration_rate_limit();
1eb3fc0a 1697 } while ((pss->page & (pagesize_bits - 1)) &&
8bba004c
AR
1698 offset_in_ramblock(pss->block,
1699 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
a82d593b
DDAG
1700
1701 /* The offset we leave with is the last one we looked at */
a935e30f 1702 pss->page--;
a82d593b
DDAG
1703 return pages;
1704}
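
/*
 * Worked example (an illustrative sketch, not part of the original code,
 * assuming 4 KiB target pages, i.e. TARGET_PAGE_BITS == 12, and a RAMBlock
 * backed by 2 MiB huge pages):
 *
 *   pagesize_bits = (2 MiB >> 12) = 512 target pages per host page
 *
 * Entering ram_save_host_page() with pss->page == 1000 (inside the host
 * page spanning target pages 512..1023), the loop scans pages 1000..1023,
 * sending the dirty ones, and exits when (pss->page & 511) == 0, i.e. at
 * 1024, the start of the next host page.  The final pss->page-- leaves the
 * cursor on 1023, the last page looked at.
 */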
6c595cde 1705
56e93d26 1706/**
3d0684b2 1707 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1708 *
1709 * Called within an RCU critical section.
1710 *
e8f3735f
XG
1711 * Returns the number of pages written where zero means no dirty pages,
1712 * or negative on error
56e93d26 1713 *
6f37bb8b 1714 * @rs: current RAM state
56e93d26 1715 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1716 *
1717 * On systems where host-page-size > target-page-size it will send all the
1718 * pages in a host page that are dirty.
56e93d26
JQ
1719 */
1720
ce25d337 1721static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1722{
b8fb8cb7 1723 PageSearchStatus pss;
56e93d26 1724 int pages = 0;
b9e60928 1725 bool again, found;
56e93d26 1726
0827b9e9
AA
1727 /* No dirty page as there is zero RAM */
1728 if (!ram_bytes_total()) {
1729 return pages;
1730 }
1731
6f37bb8b 1732 pss.block = rs->last_seen_block;
a935e30f 1733 pss.page = rs->last_page;
b8fb8cb7
DDAG
1734 pss.complete_round = false;
1735
1736 if (!pss.block) {
1737 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1738 }
56e93d26 1739
b9e60928 1740 do {
a82d593b 1741 again = true;
f20e2865 1742 found = get_queued_page(rs, &pss);
b9e60928 1743
a82d593b
DDAG
1744 if (!found) {
1745 /* priority queue empty, so just search for something dirty */
f20e2865 1746 found = find_dirty_block(rs, &pss, &again);
a82d593b 1747 }
f3f491fc 1748
a82d593b 1749 if (found) {
f20e2865 1750 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1751 }
b9e60928 1752 } while (!pages && again);
56e93d26 1753
6f37bb8b 1754 rs->last_seen_block = pss.block;
a935e30f 1755 rs->last_page = pss.page;
56e93d26
JQ
1756
1757 return pages;
1758}
1759
1760void acct_update_position(QEMUFile *f, size_t size, bool zero)
1761{
1762 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1763
56e93d26 1764 if (zero) {
9360447d 1765 ram_counters.duplicate += pages;
56e93d26 1766 } else {
9360447d
JQ
1767 ram_counters.normal += pages;
1768 ram_counters.transferred += size;
56e93d26
JQ
1769 qemu_update_position(f, size);
1770 }
1771}
1772
fbd162e6 1773static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
1774{
1775 RAMBlock *block;
1776 uint64_t total = 0;
1777
89ac5a1d
DDAG
1778 RCU_READ_LOCK_GUARD();
1779
fbd162e6
YK
1780 if (count_ignored) {
1781 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1782 total += block->used_length;
1783 }
1784 } else {
1785 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1786 total += block->used_length;
1787 }
99e15582 1788 }
56e93d26
JQ
1789 return total;
1790}
1791
fbd162e6
YK
1792uint64_t ram_bytes_total(void)
1793{
1794 return ram_bytes_total_common(false);
1795}
1796
f265e0e4 1797static void xbzrle_load_setup(void)
56e93d26 1798{
f265e0e4 1799 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
1800}
1801
f265e0e4
JQ
1802static void xbzrle_load_cleanup(void)
1803{
1804 g_free(XBZRLE.decoded_buf);
1805 XBZRLE.decoded_buf = NULL;
1806}
1807
7d7c96be
PX
1808static void ram_state_cleanup(RAMState **rsp)
1809{
b9ccaf6d
DDAG
1810 if (*rsp) {
1811 migration_page_queue_free(*rsp);
1812 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1813 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1814 g_free(*rsp);
1815 *rsp = NULL;
1816 }
7d7c96be
PX
1817}
1818
84593a08
PX
1819static void xbzrle_cleanup(void)
1820{
1821 XBZRLE_cache_lock();
1822 if (XBZRLE.cache) {
1823 cache_fini(XBZRLE.cache);
1824 g_free(XBZRLE.encoded_buf);
1825 g_free(XBZRLE.current_buf);
1826 g_free(XBZRLE.zero_target_page);
1827 XBZRLE.cache = NULL;
1828 XBZRLE.encoded_buf = NULL;
1829 XBZRLE.current_buf = NULL;
1830 XBZRLE.zero_target_page = NULL;
1831 }
1832 XBZRLE_cache_unlock();
1833}
1834
f265e0e4 1835static void ram_save_cleanup(void *opaque)
56e93d26 1836{
53518d94 1837 RAMState **rsp = opaque;
6b6712ef 1838 RAMBlock *block;
eb859c53 1839
2ff64038 1840    /* The caller holds the iothread lock or is in a bh, so there is
4633456c 1841 * no writing race against the migration bitmap
2ff64038 1842 */
6b6712ef
JQ
1843 memory_global_dirty_log_stop();
1844
fbd162e6 1845 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
1846 g_free(block->clear_bmap);
1847 block->clear_bmap = NULL;
6b6712ef
JQ
1848 g_free(block->bmap);
1849 block->bmap = NULL;
56e93d26
JQ
1850 }
1851
84593a08 1852 xbzrle_cleanup();
f0afa331 1853 compress_threads_save_cleanup();
7d7c96be 1854 ram_state_cleanup(rsp);
56e93d26
JQ
1855}
1856
6f37bb8b 1857static void ram_state_reset(RAMState *rs)
56e93d26 1858{
6f37bb8b
JQ
1859 rs->last_seen_block = NULL;
1860 rs->last_sent_block = NULL;
269ace29 1861 rs->last_page = 0;
6f37bb8b
JQ
1862 rs->last_version = ram_list.version;
1863 rs->ram_bulk_stage = true;
6eeb63f7 1864 rs->fpo_enabled = false;
56e93d26
JQ
1865}
1866
1867#define MAX_WAIT 50 /* ms, half buffered_file limit */
1868
4f2e4252
DDAG
1869/*
1870 * 'expected' is the value you expect the bitmap mostly to be full
1871 * of; it won't bother printing lines that are all this value.
1872 * If 'todump' is null the migration bitmap is dumped.
1873 */
6b6712ef
JQ
1874void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1875 unsigned long pages)
4f2e4252 1876{
4f2e4252
DDAG
1877 int64_t cur;
1878 int64_t linelen = 128;
1879 char linebuf[129];
1880
6b6712ef 1881 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1882 int64_t curb;
1883 bool found = false;
1884 /*
1885 * Last line; catch the case where the line length
1886 * is longer than remaining ram
1887 */
6b6712ef
JQ
1888 if (cur + linelen > pages) {
1889 linelen = pages - cur;
4f2e4252
DDAG
1890 }
1891 for (curb = 0; curb < linelen; curb++) {
1892 bool thisbit = test_bit(cur + curb, todump);
1893 linebuf[curb] = thisbit ? '1' : '.';
1894 found = found || (thisbit != expected);
1895 }
1896 if (found) {
1897 linebuf[curb] = '\0';
1898 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1899 }
1900 }
1901}
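
/*
 * Sample output (illustrative only): each printed line covers 128 pages,
 * with '1' for a set bit and '.' for a clear one, and a line is emitted
 * only when it contains at least one bit that differs from @expected, e.g.
 *
 *   0x00000080 : ...........11111111111111.......................
 *
 * meaning some pages in the range [0x80, 0x100) differ from the expected
 * value.
 */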
1902
e0b266f0
DDAG
1903/* **** functions for postcopy ***** */
1904
ced1c616
PB
1905void ram_postcopy_migrated_memory_release(MigrationState *ms)
1906{
1907 struct RAMBlock *block;
ced1c616 1908
fbd162e6 1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
1910 unsigned long *bitmap = block->bmap;
1911 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1912 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1913
1914 while (run_start < range) {
1915 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
1916 ram_discard_range(block->idstr,
1917 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1918 ((ram_addr_t)(run_end - run_start))
1919 << TARGET_PAGE_BITS);
ced1c616
PB
1920 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1921 }
1922 }
1923}
1924
3d0684b2
JQ
1925/**
1926 * postcopy_send_discard_bm_ram: discard a RAMBlock
1927 *
1928 * Returns zero on success
1929 *
e0b266f0 1930 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
1931 *
1932 * @ms: current migration state
89dab31b 1933 * @block: RAMBlock to discard
e0b266f0 1934 */
810cf2bb 1935static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 1936{
6b6712ef 1937 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1938 unsigned long current;
1e7cf8c3 1939 unsigned long *bitmap = block->bmap;
e0b266f0 1940
6b6712ef 1941 for (current = 0; current < end; ) {
1e7cf8c3 1942 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 1943 unsigned long zero, discard_length;
e0b266f0 1944
33a5cb62
WY
1945 if (one >= end) {
1946 break;
1947 }
e0b266f0 1948
1e7cf8c3 1949 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
1950
1951 if (zero >= end) {
1952 discard_length = end - one;
e0b266f0 1953 } else {
33a5cb62
WY
1954 discard_length = zero - one;
1955 }
810cf2bb 1956 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 1957 current = one + discard_length;
e0b266f0
DDAG
1958 }
1959
1960 return 0;
1961}
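
/*
 * Worked example (illustrative only): with end == 16 and the dirty bitmap
 * having bits 3, 4, 5 and 9 set, the loop above emits
 *
 *   postcopy_discard_send_range(ms, 3, 3);   // pages 3..5
 *   postcopy_discard_send_range(ms, 9, 1);   // page 9
 *
 * and then stops, because the next find_next_bit() returns >= end.
 */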
1962
3d0684b2
JQ
1963/**
1964 * postcopy_each_ram_send_discard: discard all RAMBlocks
1965 *
1966 * Returns 0 for success or negative for error
1967 *
e0b266f0
DDAG
1968 * Utility for the outgoing postcopy code.
1969 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1970 * passing it bitmap indexes and name.
e0b266f0
DDAG
1971 * (qemu_ram_foreach_block ends up passing unscaled lengths
1972 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1973 *
1974 * @ms: current migration state
e0b266f0
DDAG
1975 */
1976static int postcopy_each_ram_send_discard(MigrationState *ms)
1977{
1978 struct RAMBlock *block;
1979 int ret;
1980
fbd162e6 1981 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 1982 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
1983
1984 /*
1985 * Postcopy sends chunks of bitmap over the wire, but it
1986         * just needs indexes at this point, which avoids it having
1987         * target-page-specific code.
1988 */
810cf2bb
WY
1989 ret = postcopy_send_discard_bm_ram(ms, block);
1990 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
1991 if (ret) {
1992 return ret;
1993 }
1994 }
1995
1996 return 0;
1997}
1998
3d0684b2 1999/**
8324ef86 2000 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2001 *
2002 * Helper for postcopy_chunk_hostpages; it's called twice to
2003 * canonicalize the two bitmaps, that are similar, but one is
2004 * inverted.
99e314eb 2005 *
3d0684b2
JQ
2006 * Postcopy requires that all target pages in a hostpage are dirty or
2007 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2008 *
3d0684b2 2009 * @ms: current migration state
3d0684b2 2010 * @block: block that contains the page we want to canonicalize
99e314eb 2011 */
1e7cf8c3 2012static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2013{
53518d94 2014 RAMState *rs = ram_state;
6b6712ef 2015 unsigned long *bitmap = block->bmap;
29c59172 2016 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2017 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2018 unsigned long run_start;
2019
29c59172
DDAG
2020 if (block->page_size == TARGET_PAGE_SIZE) {
2021 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2022 return;
2023 }
2024
1e7cf8c3
WY
2025 /* Find a dirty page */
2026 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2027
6b6712ef 2028 while (run_start < pages) {
99e314eb
DDAG
2029
2030 /*
2031 * If the start of this run of pages is in the middle of a host
2032 * page, then we need to fixup this host page.
2033 */
9dec3cc3 2034 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2035 /* Find the end of this run */
1e7cf8c3 2036 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2037 /*
2038 * If the end isn't at the start of a host page, then the
2039 * run doesn't finish at the end of a host page
2040 * and we need to discard.
2041 */
99e314eb
DDAG
2042 }
2043
9dec3cc3 2044 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2045 unsigned long page;
dad45ab2
WY
2046 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2047 host_ratio);
2048 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2049
99e314eb
DDAG
2050 /* Clean up the bitmap */
2051 for (page = fixup_start_addr;
2052 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2053 /*
2054 * Remark them as dirty, updating the count for any pages
2055 * that weren't previously dirty.
2056 */
0d8ec885 2057 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2058 }
2059 }
2060
1e7cf8c3
WY
2061 /* Find the next dirty page for the next iteration */
2062 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2063 }
2064}
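
/*
 * Worked example (illustrative only, assuming 4 KiB target pages and a
 * 2 MiB hugepage block, so host_ratio == 512): if the dirty run covers
 * target pages 1000..1029, the pass above
 *
 *   - sees run_start == 1000 (unaligned), marks host page 512..1023
 *     fully dirty and advances run_start to 1024;
 *   - sees run_start == 1024 (aligned), walks to the end of the run at
 *     1030 (unaligned), marks host page 1024..1535 fully dirty.
 *
 * Every host page touched by the run ends up either fully dirty or fully
 * clean, which is what postcopy requires.
 */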
2065
3d0684b2 2066/**
89dab31b 2067 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 2068 *
99e314eb
DDAG
2069 * Utility for the outgoing postcopy code.
2070 *
2071 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2072 * dirty host-page size chunks as all dirty. In this case the host-page
2073 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2074 *
3d0684b2
JQ
2075 * Returns zero on success
2076 *
2077 * @ms: current migration state
6b6712ef 2078 * @block: block we want to work with
99e314eb 2079 */
6b6712ef 2080static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2081{
810cf2bb 2082 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2083
6b6712ef 2084 /*
1e7cf8c3 2085 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 2086 */
1e7cf8c3 2087 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 2088
810cf2bb 2089 postcopy_discard_send_finish(ms);
99e314eb
DDAG
2090 return 0;
2091}
2092
3d0684b2
JQ
2093/**
2094 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2095 *
2096 * Returns zero on success
2097 *
e0b266f0
DDAG
2098 * Transmit the set of pages to be discarded after precopy to the target;
2099 * these are pages that:
2100 * a) Have been previously transmitted but are now dirty again
2101 * b) Pages that have never been transmitted, this ensures that
2102 * any pages on the destination that have been mapped by background
2103 * tasks get discarded (transparent huge pages is the specific concern)
2104 * Hopefully this is pretty sparse
3d0684b2
JQ
2105 *
2106 * @ms: current migration state
e0b266f0
DDAG
2107 */
2108int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2109{
53518d94 2110 RAMState *rs = ram_state;
6b6712ef 2111 RAMBlock *block;
e0b266f0 2112 int ret;
e0b266f0 2113
89ac5a1d 2114 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2115
2116 /* This should be our last sync, the src is now paused */
eb859c53 2117 migration_bitmap_sync(rs);
e0b266f0 2118
6b6712ef
JQ
2119 /* Easiest way to make sure we don't resume in the middle of a host-page */
2120 rs->last_seen_block = NULL;
2121 rs->last_sent_block = NULL;
2122 rs->last_page = 0;
e0b266f0 2123
fbd162e6 2124 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2125 /* Deal with TPS != HPS and huge pages */
2126 ret = postcopy_chunk_hostpages(ms, block);
2127 if (ret) {
6b6712ef
JQ
2128 return ret;
2129 }
e0b266f0 2130
e0b266f0 2131#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
2132 ram_debug_dump_bitmap(block->bmap, true,
2133 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 2134#endif
6b6712ef
JQ
2135 }
2136 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2137
2138 ret = postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
2139
2140 return ret;
2141}
2142
3d0684b2
JQ
2143/**
2144 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2145 *
3d0684b2 2146 * Returns zero on success
e0b266f0 2147 *
36449157
JQ
2148 * @rbname: name of the RAMBlock of the request. NULL means the
2149 * same as the last one.
3d0684b2
JQ
2150 * @start: RAMBlock starting page
2151 * @length: RAMBlock size
e0b266f0 2152 */
aaa2064c 2153int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2154{
36449157 2155 trace_ram_discard_range(rbname, start, length);
d3a5038c 2156
89ac5a1d 2157 RCU_READ_LOCK_GUARD();
36449157 2158 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2159
2160 if (!rb) {
36449157 2161 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2162 return -1;
e0b266f0
DDAG
2163 }
2164
814bb08f
PX
2165 /*
2166 * On source VM, we don't need to update the received bitmap since
2167 * we don't even have one.
2168 */
2169 if (rb->receivedmap) {
2170 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2171 length >> qemu_target_page_bits());
2172 }
2173
03acb4e9 2174 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2175}
2176
84593a08
PX
2177/*
2178 * For every allocation, we will try not to crash the VM if the
2179 * allocation fails.
2180 */
2181static int xbzrle_init(void)
2182{
2183 Error *local_err = NULL;
2184
2185 if (!migrate_use_xbzrle()) {
2186 return 0;
2187 }
2188
2189 XBZRLE_cache_lock();
2190
2191 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2192 if (!XBZRLE.zero_target_page) {
2193 error_report("%s: Error allocating zero page", __func__);
2194 goto err_out;
2195 }
2196
2197 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2198 TARGET_PAGE_SIZE, &local_err);
2199 if (!XBZRLE.cache) {
2200 error_report_err(local_err);
2201 goto free_zero_page;
2202 }
2203
2204 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2205 if (!XBZRLE.encoded_buf) {
2206 error_report("%s: Error allocating encoded_buf", __func__);
2207 goto free_cache;
2208 }
2209
2210 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2211 if (!XBZRLE.current_buf) {
2212 error_report("%s: Error allocating current_buf", __func__);
2213 goto free_encoded_buf;
2214 }
2215
2216 /* We are all good */
2217 XBZRLE_cache_unlock();
2218 return 0;
2219
2220free_encoded_buf:
2221 g_free(XBZRLE.encoded_buf);
2222 XBZRLE.encoded_buf = NULL;
2223free_cache:
2224 cache_fini(XBZRLE.cache);
2225 XBZRLE.cache = NULL;
2226free_zero_page:
2227 g_free(XBZRLE.zero_target_page);
2228 XBZRLE.zero_target_page = NULL;
2229err_out:
2230 XBZRLE_cache_unlock();
2231 return -ENOMEM;
2232}
2233
53518d94 2234static int ram_state_init(RAMState **rsp)
56e93d26 2235{
7d00ee6a
PX
2236 *rsp = g_try_new0(RAMState, 1);
2237
2238 if (!*rsp) {
2239 error_report("%s: Init ramstate fail", __func__);
2240 return -1;
2241 }
53518d94
JQ
2242
2243 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2244 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2245 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2246
7d00ee6a 2247 /*
40c4d4a8
IR
2248 * Count the total number of pages used by ram blocks not including any
2249 * gaps due to alignment or unplugs.
03158519 2250 * This must match with the initial values of dirty bitmap.
7d00ee6a 2251 */
40c4d4a8 2252 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2253 ram_state_reset(*rsp);
2254
2255 return 0;
2256}
2257
d6eff5d7 2258static void ram_list_init_bitmaps(void)
7d00ee6a 2259{
002cad6b 2260 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2261 RAMBlock *block;
2262 unsigned long pages;
002cad6b 2263 uint8_t shift;
56e93d26 2264
0827b9e9
AA
2265 /* Skip setting bitmap if there is no RAM */
2266 if (ram_bytes_total()) {
002cad6b
PX
2267 shift = ms->clear_bitmap_shift;
2268 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2269 error_report("clear_bitmap_shift (%u) too big, using "
2270 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2271 shift = CLEAR_BITMAP_SHIFT_MAX;
2272 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2273 error_report("clear_bitmap_shift (%u) too small, using "
2274 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2275 shift = CLEAR_BITMAP_SHIFT_MIN;
2276 }
2277
fbd162e6 2278 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2279 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2280 /*
2281 * The initial dirty bitmap for migration must be set with all
2282 * ones to make sure we'll migrate every guest RAM page to
2283 * destination.
40c4d4a8
IR
2284             * Here we set RAMBlock.bmap all to 1 because when we restart a
2285             * new migration after a failed migration, ram_list.
2286             * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2287             * guest memory.
03158519 2288 */
6b6712ef 2289 block->bmap = bitmap_new(pages);
40c4d4a8 2290 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2291 block->clear_bmap_shift = shift;
2292 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2293 }
f3f491fc 2294 }
d6eff5d7
PX
2295}
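
/*
 * Sizing example (a sketch, not from the original source): for a 4 GiB
 * RAMBlock with 4 KiB target pages, pages == 1 << 20.  Each clear_bmap bit
 * stands for a chunk of (1 << shift) target pages, so with shift == 18 one
 * bit covers 1 GiB of guest memory and clear_bmap_size(pages, shift) comes
 * out to just 4 bits, while block->bmap itself needs the full 1M bits, one
 * per target page.
 */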
2296
2297static void ram_init_bitmaps(RAMState *rs)
2298{
2299 /* For memory_global_dirty_log_start below. */
2300 qemu_mutex_lock_iothread();
2301 qemu_mutex_lock_ramlist();
f3f491fc 2302
89ac5a1d
DDAG
2303 WITH_RCU_READ_LOCK_GUARD() {
2304 ram_list_init_bitmaps();
2305 memory_global_dirty_log_start();
2306 migration_bitmap_sync_precopy(rs);
2307 }
56e93d26 2308 qemu_mutex_unlock_ramlist();
49877834 2309 qemu_mutex_unlock_iothread();
d6eff5d7
PX
2310}
2311
2312static int ram_init_all(RAMState **rsp)
2313{
2314 if (ram_state_init(rsp)) {
2315 return -1;
2316 }
2317
2318 if (xbzrle_init()) {
2319 ram_state_cleanup(rsp);
2320 return -1;
2321 }
2322
2323 ram_init_bitmaps(*rsp);
a91246c9
HZ
2324
2325 return 0;
2326}
2327
08614f34
PX
2328static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2329{
2330 RAMBlock *block;
2331 uint64_t pages = 0;
2332
2333 /*
2334 * Postcopy is not using xbzrle/compression, so no need for that.
2335     * Also, since the source is already halted, we don't need to care
2336     * about dirty page logging either.
2337 */
2338
fbd162e6 2339 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2340 pages += bitmap_count_one(block->bmap,
2341 block->used_length >> TARGET_PAGE_BITS);
2342 }
2343
2344 /* This may not be aligned with current bitmaps. Recalculate. */
2345 rs->migration_dirty_pages = pages;
2346
2347 rs->last_seen_block = NULL;
2348 rs->last_sent_block = NULL;
2349 rs->last_page = 0;
2350 rs->last_version = ram_list.version;
2351 /*
2352 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2353 * matter what we have sent.
2354 */
2355 rs->ram_bulk_stage = false;
2356
2357 /* Update RAMState cache of output QEMUFile */
2358 rs->f = out;
2359
2360 trace_ram_state_resume_prepare(pages);
2361}
2362
6bcb05fc
WW
2363/*
2364 * This function clears bits of the free pages reported by the caller from the
2365 * migration dirty bitmap. @addr is the host address corresponding to the
2366 * start of the contiguous guest free pages, and @len is the total bytes of
2367 * those pages.
2368 */
2369void qemu_guest_free_page_hint(void *addr, size_t len)
2370{
2371 RAMBlock *block;
2372 ram_addr_t offset;
2373 size_t used_len, start, npages;
2374 MigrationState *s = migrate_get_current();
2375
2376 /* This function is currently expected to be used during live migration */
2377 if (!migration_is_setup_or_active(s->state)) {
2378 return;
2379 }
2380
2381 for (; len > 0; len -= used_len, addr += used_len) {
2382 block = qemu_ram_block_from_host(addr, false, &offset);
2383 if (unlikely(!block || offset >= block->used_length)) {
2384 /*
2385 * The implementation might not support RAMBlock resize during
2386 * live migration, but it could happen in theory with future
2387 * updates. So we add a check here to capture that case.
2388 */
2389 error_report_once("%s unexpected error", __func__);
2390 return;
2391 }
2392
2393 if (len <= block->used_length - offset) {
2394 used_len = len;
2395 } else {
2396 used_len = block->used_length - offset;
2397 }
2398
2399 start = offset >> TARGET_PAGE_BITS;
2400 npages = used_len >> TARGET_PAGE_BITS;
2401
2402 qemu_mutex_lock(&ram_state->bitmap_mutex);
2403 ram_state->migration_dirty_pages -=
2404 bitmap_count_one_with_offset(block->bmap, start, npages);
2405 bitmap_clear(block->bmap, start, npages);
2406 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2407 }
2408}
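
/*
 * Example of the arithmetic above (illustrative, assuming 4 KiB target
 * pages): a hint whose host address maps 8 MiB into a RAMBlock, with
 * len == 1 MiB, gives offset == 0x800000, so start == 2048 and
 * npages == 256.  Those 256 bits are cleared from block->bmap and
 * migration_dirty_pages drops by however many of them were still set.
 * If the hinted range runs past the end of the block, used_len is capped
 * and the outer loop continues with the remaining bytes.
 */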
2409
3d0684b2
JQ
2410/*
2411 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2412 * a long-running RCU critical section. When RCU reclaims in the code
2413 * start to become numerous it will be necessary to reduce the
2414 * granularity of these critical sections.
2415 */
2416
3d0684b2
JQ
2417/**
2418 * ram_save_setup: Setup RAM for migration
2419 *
2420 * Returns zero to indicate success and negative for error
2421 *
2422 * @f: QEMUFile where to send the data
2423 * @opaque: RAMState pointer
2424 */
a91246c9
HZ
2425static int ram_save_setup(QEMUFile *f, void *opaque)
2426{
53518d94 2427 RAMState **rsp = opaque;
a91246c9
HZ
2428 RAMBlock *block;
2429
dcaf446e
XG
2430 if (compress_threads_save_setup()) {
2431 return -1;
2432 }
2433
a91246c9
HZ
2434 /* migration has already setup the bitmap, reuse it. */
2435 if (!migration_in_colo_state()) {
7d00ee6a 2436 if (ram_init_all(rsp) != 0) {
dcaf446e 2437 compress_threads_save_cleanup();
a91246c9 2438 return -1;
53518d94 2439 }
a91246c9 2440 }
53518d94 2441 (*rsp)->f = f;
a91246c9 2442
0e6ebd48
DDAG
2443 WITH_RCU_READ_LOCK_GUARD() {
2444 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2445
0e6ebd48
DDAG
2446 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2447 qemu_put_byte(f, strlen(block->idstr));
2448 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2449 qemu_put_be64(f, block->used_length);
2450 if (migrate_postcopy_ram() && block->page_size !=
2451 qemu_host_page_size) {
2452 qemu_put_be64(f, block->page_size);
2453 }
2454 if (migrate_ignore_shared()) {
2455 qemu_put_be64(f, block->mr->addr);
2456 }
fbd162e6 2457 }
56e93d26
JQ
2458 }
2459
56e93d26
JQ
2460 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2461 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2462
99f2c6fb 2463 multifd_send_sync_main(f);
56e93d26 2464 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2465 qemu_fflush(f);
56e93d26
JQ
2466
2467 return 0;
2468}
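
/*
 * Rough shape of the setup-stage stream produced above (a sketch, for
 * orientation only):
 *
 *   be64  total_ram_bytes | RAM_SAVE_FLAG_MEM_SIZE
 *   per migratable RAMBlock:
 *     u8    strlen(idstr)
 *     bytes idstr
 *     be64  used_length
 *     be64  page_size        (only if postcopy-ram is on and it differs
 *                             from the host page size)
 *     be64  mr->addr         (only if ignore-shared is enabled)
 *   be64  RAM_SAVE_FLAG_EOS
 *
 * The RAM_SAVE_FLAG_MEM_SIZE branch of ram_load_precopy() parses this.
 */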
2469
3d0684b2
JQ
2470/**
2471 * ram_save_iterate: iterative stage for migration
2472 *
2473 * Returns zero to indicate success and negative for error
2474 *
2475 * @f: QEMUFile where to send the data
2476 * @opaque: RAMState pointer
2477 */
56e93d26
JQ
2478static int ram_save_iterate(QEMUFile *f, void *opaque)
2479{
53518d94
JQ
2480 RAMState **temp = opaque;
2481 RAMState *rs = *temp;
3d4095b2 2482 int ret = 0;
56e93d26
JQ
2483 int i;
2484 int64_t t0;
5c90308f 2485 int done = 0;
56e93d26 2486
b2557345
PL
2487 if (blk_mig_bulk_active()) {
2488 /* Avoid transferring ram during bulk phase of block migration as
2489 * the bulk phase will usually take a long time and transferring
2490 * ram updates during that time is pointless. */
2491 goto out;
2492 }
2493
89ac5a1d
DDAG
2494 WITH_RCU_READ_LOCK_GUARD() {
2495 if (ram_list.version != rs->last_version) {
2496 ram_state_reset(rs);
2497 }
56e93d26 2498
89ac5a1d
DDAG
2499 /* Read version before ram_list.blocks */
2500 smp_rmb();
56e93d26 2501
89ac5a1d 2502 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2503
89ac5a1d
DDAG
2504 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2505 i = 0;
2506 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2507 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2508 int pages;
e03a34f8 2509
89ac5a1d
DDAG
2510 if (qemu_file_get_error(f)) {
2511 break;
2512 }
e8f3735f 2513
89ac5a1d
DDAG
2514 pages = ram_find_and_save_block(rs, false);
2515            /* no more pages to send */
2516 if (pages == 0) {
2517 done = 1;
2518 break;
2519 }
e8f3735f 2520
89ac5a1d
DDAG
2521 if (pages < 0) {
2522 qemu_file_set_error(f, pages);
56e93d26
JQ
2523 break;
2524 }
89ac5a1d
DDAG
2525
2526 rs->target_page_count += pages;
2527
644acf99
WY
2528 /*
2529 * During postcopy, it is necessary to make sure one whole host
2530 * page is sent in one chunk.
2531 */
2532 if (migrate_postcopy_ram()) {
2533 flush_compressed_data(rs);
2534 }
2535
89ac5a1d
DDAG
2536 /*
2537 * we want to check in the 1st loop, just in case it was the 1st
2538 * time and we had to sync the dirty bitmap.
2539             * qemu_clock_get_ns() is a bit expensive, so we only check once
2540             * every few iterations.
2541 */
2542 if ((i & 63) == 0) {
2543 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2544 1000000;
2545 if (t1 > MAX_WAIT) {
2546 trace_ram_save_iterate_big_wait(t1, i);
2547 break;
2548 }
2549 }
2550 i++;
56e93d26 2551 }
56e93d26 2552 }
56e93d26
JQ
2553
2554 /*
2555 * Must occur before EOS (or any QEMUFile operation)
2556 * because of RDMA protocol.
2557 */
2558 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2559
b2557345 2560out:
b69a0227
JQ
2561 if (ret >= 0
2562 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 2563 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2564 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2565 qemu_fflush(f);
2566 ram_counters.transferred += 8;
56e93d26 2567
3d4095b2
JQ
2568 ret = qemu_file_get_error(f);
2569 }
56e93d26
JQ
2570 if (ret < 0) {
2571 return ret;
2572 }
2573
5c90308f 2574 return done;
56e93d26
JQ
2575}
2576
3d0684b2
JQ
2577/**
2578 * ram_save_complete: function called to send the remaining amount of ram
2579 *
e8f3735f 2580 * Returns zero to indicate success or negative on error
3d0684b2
JQ
2581 *
2582 * Called with iothread lock
2583 *
2584 * @f: QEMUFile where to send the data
2585 * @opaque: RAMState pointer
2586 */
56e93d26
JQ
2587static int ram_save_complete(QEMUFile *f, void *opaque)
2588{
53518d94
JQ
2589 RAMState **temp = opaque;
2590 RAMState *rs = *temp;
e8f3735f 2591 int ret = 0;
6f37bb8b 2592
89ac5a1d
DDAG
2593 WITH_RCU_READ_LOCK_GUARD() {
2594 if (!migration_in_postcopy()) {
2595 migration_bitmap_sync_precopy(rs);
2596 }
56e93d26 2597
89ac5a1d 2598 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 2599
89ac5a1d 2600 /* try transferring iterative blocks of memory */
56e93d26 2601
89ac5a1d
DDAG
2602 /* flush all remaining blocks regardless of rate limiting */
2603 while (true) {
2604 int pages;
56e93d26 2605
89ac5a1d
DDAG
2606 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2607            /* no more blocks to send */
2608 if (pages == 0) {
2609 break;
2610 }
2611 if (pages < 0) {
2612 ret = pages;
2613 break;
2614 }
e8f3735f 2615 }
56e93d26 2616
89ac5a1d
DDAG
2617 flush_compressed_data(rs);
2618 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2619 }
d09a6fde 2620
3d4095b2 2621 if (ret >= 0) {
99f2c6fb 2622 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2623 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2624 qemu_fflush(f);
2625 }
56e93d26 2626
e8f3735f 2627 return ret;
56e93d26
JQ
2628}
2629
c31b098f 2630static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
2631 uint64_t *res_precopy_only,
2632 uint64_t *res_compatible,
2633 uint64_t *res_postcopy_only)
56e93d26 2634{
53518d94
JQ
2635 RAMState **temp = opaque;
2636 RAMState *rs = *temp;
56e93d26
JQ
2637 uint64_t remaining_size;
2638
9edabd4d 2639 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2640
5727309d 2641 if (!migration_in_postcopy() &&
663e6c1d 2642 remaining_size < max_size) {
56e93d26 2643 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
2644 WITH_RCU_READ_LOCK_GUARD() {
2645 migration_bitmap_sync_precopy(rs);
2646 }
56e93d26 2647 qemu_mutex_unlock_iothread();
9edabd4d 2648 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2649 }
c31b098f 2650
86e1167e
VSO
2651 if (migrate_postcopy_ram()) {
2652 /* We can do postcopy, and all the data is postcopiable */
47995026 2653 *res_compatible += remaining_size;
86e1167e 2654 } else {
47995026 2655 *res_precopy_only += remaining_size;
86e1167e 2656 }
56e93d26
JQ
2657}
2658
2659static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2660{
2661 unsigned int xh_len;
2662 int xh_flags;
063e760a 2663 uint8_t *loaded_data;
56e93d26 2664
56e93d26
JQ
2665 /* extract RLE header */
2666 xh_flags = qemu_get_byte(f);
2667 xh_len = qemu_get_be16(f);
2668
2669 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2670 error_report("Failed to load XBZRLE page - wrong compression!");
2671 return -1;
2672 }
2673
2674 if (xh_len > TARGET_PAGE_SIZE) {
2675 error_report("Failed to load XBZRLE page - len overflow!");
2676 return -1;
2677 }
f265e0e4 2678 loaded_data = XBZRLE.decoded_buf;
56e93d26 2679 /* load data and decode */
f265e0e4 2680 /* it can change loaded_data to point to an internal buffer */
063e760a 2681 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2682
2683 /* decode RLE */
063e760a 2684 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2685 TARGET_PAGE_SIZE) == -1) {
2686 error_report("Failed to load XBZRLE page - decode error!");
2687 return -1;
2688 }
2689
2690 return 0;
2691}
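
/*
 * On-the-wire layout consumed above (a summary, inferred from the reads):
 *
 *   u8    xh_flags   -- must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len     -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len     -- XBZRLE delta, decoded against the current
 *                       contents of @host
 *
 * The decode is a delta against what is already in guest memory, so it
 * only works if the previous version of the page has been loaded.
 */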
2692
3d0684b2
JQ
2693/**
2694 * ram_block_from_stream: read a RAMBlock id from the migration stream
2695 *
2696 * Must be called from within a rcu critical section.
2697 *
56e93d26 2698 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2699 *
3d0684b2
JQ
2700 * @f: QEMUFile where to read the data from
2701 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2702 */
3d0684b2 2703static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2704{
2705 static RAMBlock *block = NULL;
2706 char id[256];
2707 uint8_t len;
2708
2709 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2710 if (!block) {
56e93d26
JQ
2711 error_report("Ack, bad migration stream!");
2712 return NULL;
2713 }
4c4bad48 2714 return block;
56e93d26
JQ
2715 }
2716
2717 len = qemu_get_byte(f);
2718 qemu_get_buffer(f, (uint8_t *)id, len);
2719 id[len] = 0;
2720
e3dd7493 2721 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2722 if (!block) {
2723 error_report("Can't find block %s", id);
2724 return NULL;
56e93d26
JQ
2725 }
2726
fbd162e6 2727 if (ramblock_is_ignored(block)) {
b895de50
CLG
2728 error_report("block %s should not be migrated !", id);
2729 return NULL;
2730 }
2731
4c4bad48
HZ
2732 return block;
2733}
2734
2735static inline void *host_from_ram_block_offset(RAMBlock *block,
2736 ram_addr_t offset)
2737{
2738 if (!offset_in_ramblock(block, offset)) {
2739 return NULL;
2740 }
2741
2742 return block->host + offset;
56e93d26
JQ
2743}
2744
13af18f2 2745static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 2746 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
2747{
2748 if (!offset_in_ramblock(block, offset)) {
2749 return NULL;
2750 }
2751 if (!block->colo_cache) {
2752 error_report("%s: colo_cache is NULL in block :%s",
2753 __func__, block->idstr);
2754 return NULL;
2755 }
7d9acafa
ZC
2756
2757 /*
2758     * During a colo checkpoint, we need a bitmap of these migrated pages.
2759     * It helps us decide which pages in the ram cache should be flushed
2760 * into VM's RAM later.
2761 */
8af66371
HZ
2762 if (record_bitmap &&
2763 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
2764 ram_state->migration_dirty_pages++;
2765 }
13af18f2
ZC
2766 return block->colo_cache + offset;
2767}
2768
3d0684b2
JQ
2769/**
2770 * ram_handle_compressed: handle the zero page case
2771 *
56e93d26
JQ
2772 * If a page (or a whole RDMA chunk) has been
2773 * determined to be zero, then zap it.
3d0684b2
JQ
2774 *
2775 * @host: host address for the zero page
2776 * @ch: what the page is filled from. We only support zero
2777 * @size: size of the zero page
56e93d26
JQ
2778 */
2779void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2780{
2781 if (ch != 0 || !is_zero_range(host, size)) {
2782 memset(host, ch, size);
2783 }
2784}
2785
797ca154
XG
2786/* return the size after decompression, or negative value on error */
2787static int
2788qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2789 const uint8_t *source, size_t source_len)
2790{
2791 int err;
2792
2793 err = inflateReset(stream);
2794 if (err != Z_OK) {
2795 return -1;
2796 }
2797
2798 stream->avail_in = source_len;
2799 stream->next_in = (uint8_t *)source;
2800 stream->avail_out = dest_len;
2801 stream->next_out = dest;
2802
2803 err = inflate(stream, Z_NO_FLUSH);
2804 if (err != Z_STREAM_END) {
2805 return -1;
2806 }
2807
2808 return stream->total_out;
2809}
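
/*
 * Usage sketch (not an additional API, just how the callers below use it):
 * the z_stream is set up once with inflateInit() in
 * compress_threads_load_setup(), reused for every page via the
 * inflateReset() above, and torn down with inflateEnd() in
 * compress_threads_load_cleanup().  Reusing the stream avoids paying the
 * zlib allocation cost on each compressed page.
 */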
2810
56e93d26
JQ
2811static void *do_data_decompress(void *opaque)
2812{
2813 DecompressParam *param = opaque;
2814 unsigned long pagesize;
33d151f4 2815 uint8_t *des;
34ab9e97 2816 int len, ret;
56e93d26 2817
33d151f4 2818 qemu_mutex_lock(&param->mutex);
90e56fb4 2819 while (!param->quit) {
33d151f4
LL
2820 if (param->des) {
2821 des = param->des;
2822 len = param->len;
2823 param->des = 0;
2824 qemu_mutex_unlock(&param->mutex);
2825
56e93d26 2826 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
2827
2828 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2829 param->compbuf, len);
f548222c 2830 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
2831 error_report("decompress data failed");
2832 qemu_file_set_error(decomp_file, ret);
2833 }
73a8912b 2834
33d151f4
LL
2835 qemu_mutex_lock(&decomp_done_lock);
2836 param->done = true;
2837 qemu_cond_signal(&decomp_done_cond);
2838 qemu_mutex_unlock(&decomp_done_lock);
2839
2840 qemu_mutex_lock(&param->mutex);
2841 } else {
2842 qemu_cond_wait(&param->cond, &param->mutex);
2843 }
56e93d26 2844 }
33d151f4 2845 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2846
2847 return NULL;
2848}
2849
34ab9e97 2850static int wait_for_decompress_done(void)
5533b2e9
LL
2851{
2852 int idx, thread_count;
2853
2854 if (!migrate_use_compression()) {
34ab9e97 2855 return 0;
5533b2e9
LL
2856 }
2857
2858 thread_count = migrate_decompress_threads();
2859 qemu_mutex_lock(&decomp_done_lock);
2860 for (idx = 0; idx < thread_count; idx++) {
2861 while (!decomp_param[idx].done) {
2862 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2863 }
2864 }
2865 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 2866 return qemu_file_get_error(decomp_file);
5533b2e9
LL
2867}
2868
f0afa331 2869static void compress_threads_load_cleanup(void)
56e93d26
JQ
2870{
2871 int i, thread_count;
2872
3416ab5b
JQ
2873 if (!migrate_use_compression()) {
2874 return;
2875 }
56e93d26
JQ
2876 thread_count = migrate_decompress_threads();
2877 for (i = 0; i < thread_count; i++) {
797ca154
XG
2878 /*
2879         * we use it as an indicator which shows if the thread is
2880 * properly init'd or not
2881 */
2882 if (!decomp_param[i].compbuf) {
2883 break;
2884 }
2885
56e93d26 2886 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2887 decomp_param[i].quit = true;
56e93d26
JQ
2888 qemu_cond_signal(&decomp_param[i].cond);
2889 qemu_mutex_unlock(&decomp_param[i].mutex);
2890 }
2891 for (i = 0; i < thread_count; i++) {
797ca154
XG
2892 if (!decomp_param[i].compbuf) {
2893 break;
2894 }
2895
56e93d26
JQ
2896 qemu_thread_join(decompress_threads + i);
2897 qemu_mutex_destroy(&decomp_param[i].mutex);
2898 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 2899 inflateEnd(&decomp_param[i].stream);
56e93d26 2900 g_free(decomp_param[i].compbuf);
797ca154 2901 decomp_param[i].compbuf = NULL;
56e93d26
JQ
2902 }
2903 g_free(decompress_threads);
2904 g_free(decomp_param);
56e93d26
JQ
2905 decompress_threads = NULL;
2906 decomp_param = NULL;
34ab9e97 2907 decomp_file = NULL;
56e93d26
JQ
2908}
2909
34ab9e97 2910static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
2911{
2912 int i, thread_count;
2913
2914 if (!migrate_use_compression()) {
2915 return 0;
2916 }
2917
2918 thread_count = migrate_decompress_threads();
2919 decompress_threads = g_new0(QemuThread, thread_count);
2920 decomp_param = g_new0(DecompressParam, thread_count);
2921 qemu_mutex_init(&decomp_done_lock);
2922 qemu_cond_init(&decomp_done_cond);
34ab9e97 2923 decomp_file = f;
797ca154
XG
2924 for (i = 0; i < thread_count; i++) {
2925 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2926 goto exit;
2927 }
2928
2929 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2930 qemu_mutex_init(&decomp_param[i].mutex);
2931 qemu_cond_init(&decomp_param[i].cond);
2932 decomp_param[i].done = true;
2933 decomp_param[i].quit = false;
2934 qemu_thread_create(decompress_threads + i, "decompress",
2935 do_data_decompress, decomp_param + i,
2936 QEMU_THREAD_JOINABLE);
2937 }
2938 return 0;
2939exit:
2940 compress_threads_load_cleanup();
2941 return -1;
2942}
2943
c1bc6626 2944static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2945 void *host, int len)
2946{
2947 int idx, thread_count;
2948
2949 thread_count = migrate_decompress_threads();
73a8912b 2950 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2951 while (true) {
2952 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2953 if (decomp_param[idx].done) {
33d151f4
LL
2954 decomp_param[idx].done = false;
2955 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2956 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2957 decomp_param[idx].des = host;
2958 decomp_param[idx].len = len;
33d151f4
LL
2959 qemu_cond_signal(&decomp_param[idx].cond);
2960 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2961 break;
2962 }
2963 }
2964 if (idx < thread_count) {
2965 break;
73a8912b
LL
2966 } else {
2967 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2968 }
2969 }
73a8912b 2970 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2971}
2972
13af18f2
ZC
2973/*
2974 * colo cache: this is for the secondary VM, we cache the whole
2975 * memory of the secondary VM; the global lock must be held
2976 * to call this helper.
2977 */
2978int colo_init_ram_cache(void)
2979{
2980 RAMBlock *block;
2981
44901b5a
PB
2982 WITH_RCU_READ_LOCK_GUARD() {
2983 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2984 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2985 NULL,
2986 false);
2987 if (!block->colo_cache) {
2988 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2989 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2990 block->used_length);
2991 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2992 if (block->colo_cache) {
2993 qemu_anon_ram_free(block->colo_cache, block->used_length);
2994 block->colo_cache = NULL;
2995 }
89ac5a1d 2996 }
44901b5a 2997 return -errno;
89ac5a1d 2998 }
13af18f2 2999 }
13af18f2 3000 }
44901b5a 3001
7d9acafa
ZC
3002 /*
3003     * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3004     * to decide which pages in the cache should be flushed into SVM's RAM. Here
3005 * we use the same name 'ram_bitmap' as for migration.
3006 */
3007 if (ram_bytes_total()) {
3008 RAMBlock *block;
3009
fbd162e6 3010 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3011 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3012 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3013 }
3014 }
7d9acafa 3015
0393031a 3016 ram_state_init(&ram_state);
13af18f2 3017 return 0;
13af18f2
ZC
3018}
3019
0393031a
HZ
3020/* TODO: duplicated with ram_init_bitmaps */
3021void colo_incoming_start_dirty_log(void)
3022{
3023 RAMBlock *block = NULL;
3024 /* For memory_global_dirty_log_start below. */
3025 qemu_mutex_lock_iothread();
3026 qemu_mutex_lock_ramlist();
3027
3028 memory_global_dirty_log_sync();
3029 WITH_RCU_READ_LOCK_GUARD() {
3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031 ramblock_sync_dirty_bitmap(ram_state, block);
3032 /* Discard this dirty bitmap record */
3033 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3034 }
3035 memory_global_dirty_log_start();
3036 }
3037 ram_state->migration_dirty_pages = 0;
3038 qemu_mutex_unlock_ramlist();
3039 qemu_mutex_unlock_iothread();
3040}
3041
13af18f2
ZC
3042/* The global lock must be held to call this helper */
3043void colo_release_ram_cache(void)
3044{
3045 RAMBlock *block;
3046
d1955d22 3047 memory_global_dirty_log_stop();
fbd162e6 3048 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3049 g_free(block->bmap);
3050 block->bmap = NULL;
3051 }
3052
89ac5a1d
DDAG
3053 WITH_RCU_READ_LOCK_GUARD() {
3054 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3055 if (block->colo_cache) {
3056 qemu_anon_ram_free(block->colo_cache, block->used_length);
3057 block->colo_cache = NULL;
3058 }
13af18f2
ZC
3059 }
3060 }
0393031a 3061 ram_state_cleanup(&ram_state);
13af18f2
ZC
3062}
3063
f265e0e4
JQ
3064/**
3065 * ram_load_setup: Setup RAM for migration incoming side
3066 *
3067 * Returns zero to indicate success and negative for error
3068 *
3069 * @f: QEMUFile where to receive the data
3070 * @opaque: RAMState pointer
3071 */
3072static int ram_load_setup(QEMUFile *f, void *opaque)
3073{
34ab9e97 3074 if (compress_threads_load_setup(f)) {
797ca154
XG
3075 return -1;
3076 }
3077
f265e0e4 3078 xbzrle_load_setup();
f9494614 3079 ramblock_recv_map_init();
13af18f2 3080
f265e0e4
JQ
3081 return 0;
3082}
3083
3084static int ram_load_cleanup(void *opaque)
3085{
f9494614 3086 RAMBlock *rb;
56eb90af 3087
fbd162e6 3088 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3089 qemu_ram_block_writeback(rb);
56eb90af
JH
3090 }
3091
f265e0e4 3092 xbzrle_load_cleanup();
f0afa331 3093 compress_threads_load_cleanup();
f9494614 3094
fbd162e6 3095 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3096 g_free(rb->receivedmap);
3097 rb->receivedmap = NULL;
3098 }
13af18f2 3099
f265e0e4
JQ
3100 return 0;
3101}
3102
3d0684b2
JQ
3103/**
3104 * ram_postcopy_incoming_init: allocate postcopy data structures
3105 *
3106 * Returns 0 for success and negative if there was an error
3107 *
3108 * @mis: current migration incoming state
3109 *
3110 * Allocate data structures etc needed by incoming migration with
3111 * postcopy-ram. postcopy-ram's similarly named
3112 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3113 */
3114int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3115{
c136180c 3116 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3117}
3118
3d0684b2
JQ
3119/**
3120 * ram_load_postcopy: load a page in postcopy case
3121 *
3122 * Returns 0 for success or -errno in case of error
3123 *
a7180877
DDAG
3124 * Called in postcopy mode by ram_load().
3125 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3126 *
3127 * @f: QEMUFile to receive the data from
a7180877
DDAG
3128 */
3129static int ram_load_postcopy(QEMUFile *f)
3130{
3131 int flags = 0, ret = 0;
3132 bool place_needed = false;
1aa83678 3133 bool matches_target_page_size = false;
a7180877
DDAG
3134 MigrationIncomingState *mis = migration_incoming_get_current();
3135 /* Temporary page that is later 'placed' */
3414322a 3136 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 3137 void *this_host = NULL;
a3b6ff6d 3138 bool all_zero = false;
4cbb3c63 3139 int target_pages = 0;
a7180877
DDAG
3140
3141 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3142 ram_addr_t addr;
3143 void *host = NULL;
3144 void *page_buffer = NULL;
3145 void *place_source = NULL;
df9ff5e1 3146 RAMBlock *block = NULL;
a7180877 3147 uint8_t ch;
644acf99 3148 int len;
a7180877
DDAG
3149
3150 addr = qemu_get_be64(f);
7a9ddfbf
PX
3151
3152 /*
3153 * If qemu file error, we should stop here, and then "addr"
3154 * may be invalid
3155 */
3156 ret = qemu_file_get_error(f);
3157 if (ret) {
3158 break;
3159 }
3160
a7180877
DDAG
3161 flags = addr & ~TARGET_PAGE_MASK;
3162 addr &= TARGET_PAGE_MASK;
3163
3164 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3165 place_needed = false;
644acf99
WY
3166 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3167 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3168 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3169
3170 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3171 if (!host) {
3172 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3173 ret = -EINVAL;
3174 break;
3175 }
4cbb3c63 3176 target_pages++;
1aa83678 3177 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3178 /*
28abd200
DDAG
3179 * Postcopy requires that we place whole host pages atomically;
3180 * these may be huge pages for RAMBlocks that are backed by
3181 * hugetlbfs.
a7180877
DDAG
3182 * To make it atomic, the data is read into a temporary page
3183 * that's moved into place later.
3184         * The migration protocol uses, possibly smaller, target pages;
3185         * however, the source ensures it always sends all the components
91ba442f 3186 * of a host page in one chunk.
a7180877
DDAG
3187 */
3188 page_buffer = postcopy_host_page +
28abd200 3189 ((uintptr_t)host & (block->page_size - 1));
a7180877 3190 /* If all TP are zero then we can optimise the place */
e5e73b0f 3191 if (target_pages == 1) {
a7180877 3192 all_zero = true;
91ba442f
WY
3193 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3194 block->page_size);
c53b7ddc
DDAG
3195 } else {
3196 /* not the 1st TP within the HP */
91ba442f
WY
3197 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3198 (uintptr_t)this_host) {
3199 error_report("Non-same host page %p/%p",
3200 host, this_host);
c53b7ddc
DDAG
3201 ret = -EINVAL;
3202 break;
3203 }
a7180877
DDAG
3204 }
3205
3206 /*
3207 * If it's the last part of a host page then we place the host
3208 * page
3209 */
4cbb3c63
WY
3210 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3211 place_needed = true;
3212 target_pages = 0;
3213 }
a7180877
DDAG
3214 place_source = postcopy_host_page;
3215 }
3216
3217 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3218 case RAM_SAVE_FLAG_ZERO:
a7180877 3219 ch = qemu_get_byte(f);
2e36bc1b
WY
3220 /*
3221             * We can skip setting page_buffer when
3222 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3223 */
3224 if (ch || !matches_target_page_size) {
3225 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3226 }
a7180877
DDAG
3227 if (ch) {
3228 all_zero = false;
3229 }
3230 break;
3231
3232 case RAM_SAVE_FLAG_PAGE:
3233 all_zero = false;
1aa83678
PX
3234 if (!matches_target_page_size) {
3235 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3236 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3237 } else {
1aa83678
PX
3238 /*
3239                 * For small pages that match the target page size, we
3240 * avoid the qemu_file copy. Instead we directly use
3241 * the buffer of QEMUFile to place the page. Note: we
3242 * cannot do any QEMUFile operation before using that
3243 * buffer to make sure the buffer is valid when
3244 * placing the page.
a7180877
DDAG
3245 */
3246 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3247 TARGET_PAGE_SIZE);
3248 }
3249 break;
644acf99
WY
3250 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3251 all_zero = false;
3252 len = qemu_get_be32(f);
3253 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3254 error_report("Invalid compressed data length: %d", len);
3255 ret = -EINVAL;
3256 break;
3257 }
3258 decompress_data_with_multi_threads(f, page_buffer, len);
3259 break;
3260
a7180877
DDAG
3261 case RAM_SAVE_FLAG_EOS:
3262 /* normal exit */
6df264ac 3263 multifd_recv_sync_main();
a7180877
DDAG
3264 break;
3265 default:
3266 error_report("Unknown combination of migration flags: %#x"
3267 " (postcopy mode)", flags);
3268 ret = -EINVAL;
7a9ddfbf
PX
3269 break;
3270 }
3271
644acf99
WY
3272 /* Got the whole host page, wait for decompress before placing. */
3273 if (place_needed) {
3274 ret |= wait_for_decompress_done();
3275 }
3276
7a9ddfbf
PX
3277 /* Detect for any possible file errors */
3278 if (!ret && qemu_file_get_error(f)) {
3279 ret = qemu_file_get_error(f);
a7180877
DDAG
3280 }
3281
7a9ddfbf 3282 if (!ret && place_needed) {
a7180877 3283 /* This gets called at the last target page in the host page */
91ba442f
WY
3284 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3285 block->page_size);
df9ff5e1 3286
a7180877 3287 if (all_zero) {
df9ff5e1 3288 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3289 block);
a7180877 3290 } else {
df9ff5e1 3291 ret = postcopy_place_page(mis, place_dest,
8be4620b 3292 place_source, block);
a7180877
DDAG
3293 }
3294 }
a7180877
DDAG
3295 }
3296
3297 return ret;
3298}
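
/*
 * Assembly example (illustrative, assuming a 2 MiB hugepage block and
 * 4 KiB target pages): the destination receives 512 target-page chunks for
 * one host page.  Each chunk is copied into postcopy_host_page at offset
 * (host & (block->page_size - 1)); only when the 512th chunk arrives
 * (target_pages == block->page_size / TARGET_PAGE_SIZE) is the whole 2 MiB
 * placed atomically with postcopy_place_page(), or with
 * postcopy_place_page_zero() if every chunk was zero.
 */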
3299
acab30b8
DHB
3300static bool postcopy_is_advised(void)
3301{
3302 PostcopyState ps = postcopy_state_get();
3303 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3304}
3305
3306static bool postcopy_is_running(void)
3307{
3308 PostcopyState ps = postcopy_state_get();
3309 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3310}
3311
e6f4aa18
ZC
3312/*
3313 * Flush content of RAM cache into SVM's memory.
3314 * Only flush the pages that be dirtied by PVM or SVM or both.
3315 */
3316static void colo_flush_ram_cache(void)
3317{
3318 RAMBlock *block = NULL;
3319 void *dst_host;
3320 void *src_host;
3321 unsigned long offset = 0;
3322
d1955d22 3323 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3324 WITH_RCU_READ_LOCK_GUARD() {
3325 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3326 ramblock_sync_dirty_bitmap(ram_state, block);
3327 }
d1955d22 3328 }
d1955d22 3329
e6f4aa18 3330 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3331 WITH_RCU_READ_LOCK_GUARD() {
3332 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3333
89ac5a1d
DDAG
3334 while (block) {
3335 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 3336
8bba004c
AR
3337 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3338 >= block->used_length) {
89ac5a1d
DDAG
3339 offset = 0;
3340 block = QLIST_NEXT_RCU(block, next);
3341 } else {
3342 migration_bitmap_clear_dirty(ram_state, block, offset);
8bba004c
AR
3343 dst_host = block->host
3344 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3345 src_host = block->colo_cache
3346 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
89ac5a1d
DDAG
3347 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3348 }
e6f4aa18
ZC
3349 }
3350 }
e6f4aa18
ZC
3351 trace_colo_flush_ram_cache_end();
3352}
3353
10da4a36
WY
3354/**
3355 * ram_load_precopy: load pages in precopy case
3356 *
3357 * Returns 0 for success or -errno in case of error
3358 *
3359 * Called in precopy mode by ram_load().
3360 * rcu_read_lock is taken prior to this being called.
3361 *
3362 * @f: QEMUFile to receive the data from
3363 */
3364static int ram_load_precopy(QEMUFile *f)
56e93d26 3365{
e65cec5e 3366 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3367 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3368 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3369 if (!migrate_use_compression()) {
3370 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3371 }
a7180877 3372
10da4a36 3373 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3374 ram_addr_t addr, total_ram_bytes;
0393031a 3375 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3376 uint8_t ch;
3377
e65cec5e
YK
3378 /*
3379         * Yield periodically to let the main loop run, but an iteration of
3380         * the main loop is expensive, so only do it every few iterations.
3381 */
3382 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3383 aio_co_schedule(qemu_get_current_aio_context(),
3384 qemu_coroutine_self());
3385 qemu_coroutine_yield();
3386 }
3387 i++;
3388
56e93d26
JQ
3389 addr = qemu_get_be64(f);
3390 flags = addr & ~TARGET_PAGE_MASK;
3391 addr &= TARGET_PAGE_MASK;
3392
edc60127
JQ
3393 if (flags & invalid_flags) {
3394 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3395 error_report("Received an unexpected compressed page");
3396 }
3397
3398 ret = -EINVAL;
3399 break;
3400 }
3401
bb890ed5 3402 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3403 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3404 RAMBlock *block = ram_block_from_stream(f, flags);
3405
0393031a 3406 host = host_from_ram_block_offset(block, addr);
13af18f2 3407 /*
0393031a
HZ
3408             * After going into the COLO stage, we should not load the page
3409             * into SVM's memory directly; we put it into colo_cache first.
3410             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3411             * Previously, we copied all this memory in the preparing stage of
3412             * COLO, when we needed to stop the VM, which is a time-consuming
3413             * process. Here we optimize it with a trick: back up every page
3414             * during the migration process while COLO is enabled. Though it
3415             * affects the speed of the migration, it obviously reduces the
3416             * downtime of backing up all of SVM's memory in the COLO preparing stage.
13af18f2 3417 */
0393031a
HZ
3418 if (migration_incoming_colo_enabled()) {
3419 if (migration_incoming_in_colo_state()) {
3420 /* In COLO stage, put all pages into cache temporarily */
8af66371 3421 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3422 } else {
3423 /*
3424 * In migration stage but before COLO stage,
3425 * Put all pages into both cache and SVM's memory.
3426 */
8af66371 3427 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3428 }
13af18f2 3429 }
a776aa15
DDAG
3430 if (!host) {
3431 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3432 ret = -EINVAL;
3433 break;
3434 }
13af18f2
ZC
3435 if (!migration_incoming_in_colo_state()) {
3436 ramblock_recv_bitmap_set(block, host);
3437 }
3438
1db9d8e5 3439 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3440 }
3441
56e93d26
JQ
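        /*
         * RAM_SAVE_FLAG_CONTINUE only marks the page as belonging to the same
         * RAMBlock as the previous one, so mask it out before dispatching on
         * the remaining flags.
         */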
3442 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3443 case RAM_SAVE_FLAG_MEM_SIZE:
3444 /* Synchronize RAM block list */
3445 total_ram_bytes = addr;
3446 while (!ret && total_ram_bytes) {
3447 RAMBlock *block;
56e93d26
JQ
3448 char id[256];
3449 ram_addr_t length;
3450
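                /* Each entry: one length byte, the block idstr, then a be64 length. */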
3451 len = qemu_get_byte(f);
3452 qemu_get_buffer(f, (uint8_t *)id, len);
3453 id[len] = 0;
3454 length = qemu_get_be64(f);
3455
e3dd7493 3456 block = qemu_ram_block_by_name(id);
b895de50
CLG
3457 if (block && !qemu_ram_is_migratable(block)) {
3458                     error_report("block %s should not be migrated!", id);
3459 ret = -EINVAL;
3460 } else if (block) {
e3dd7493
DDAG
3461 if (length != block->used_length) {
3462 Error *local_err = NULL;
56e93d26 3463
fa53a0e5 3464 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3465 &local_err);
3466 if (local_err) {
3467 error_report_err(local_err);
56e93d26 3468 }
56e93d26 3469 }
ef08fb38
DDAG
3470                 /* For postcopy we need to check that hugepage sizes match */
3471 if (postcopy_advised &&
3472 block->page_size != qemu_host_page_size) {
3473 uint64_t remote_page_size = qemu_get_be64(f);
3474 if (remote_page_size != block->page_size) {
3475 error_report("Mismatched RAM page size %s "
3476 "(local) %zd != %" PRId64,
3477 id, block->page_size,
3478 remote_page_size);
3479 ret = -EINVAL;
3480 }
3481 }
fbd162e6
YK
3482 if (migrate_ignore_shared()) {
3483 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
3484 if (ramblock_is_ignored(block) &&
3485 block->mr->addr != addr) {
3486 error_report("Mismatched GPAs for block %s "
3487                                  "%" PRId64 " != %" PRId64,
3488 id, (uint64_t)addr,
3489 (uint64_t)block->mr->addr);
3490 ret = -EINVAL;
3491 }
3492 }
e3dd7493
DDAG
3493 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3494 block->idstr);
3495 } else {
56e93d26
JQ
3496 error_report("Unknown ramblock \"%s\", cannot "
3497 "accept migration", id);
3498 ret = -EINVAL;
3499 }
3500
3501 total_ram_bytes -= length;
3502 }
3503 break;
a776aa15 3504
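        /* A zero page is encoded as a single fill byte for the whole page. */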
bb890ed5 3505 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
3506 ch = qemu_get_byte(f);
3507 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3508 break;
a776aa15 3509
56e93d26 3510 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
3511 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3512 break;
56e93d26 3513
a776aa15 3514 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
3515 len = qemu_get_be32(f);
3516 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3517 error_report("Invalid compressed data length: %d", len);
3518 ret = -EINVAL;
3519 break;
3520 }
c1bc6626 3521 decompress_data_with_multi_threads(f, host, len);
56e93d26 3522 break;
a776aa15 3523
56e93d26 3524 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
3525 if (load_xbzrle(f, addr, host) < 0) {
3526 error_report("Failed to decompress XBZRLE page at "
3527 RAM_ADDR_FMT, addr);
3528 ret = -EINVAL;
3529 break;
3530 }
3531 break;
3532 case RAM_SAVE_FLAG_EOS:
3533 /* normal exit */
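            /* EOS also acts as the sync point for the multifd recv channels. */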
6df264ac 3534 multifd_recv_sync_main();
56e93d26
JQ
3535 break;
3536 default:
3537 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 3538 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
3539 } else {
3540 error_report("Unknown combination of migration flags: %#x",
3541 flags);
3542 ret = -EINVAL;
3543 }
3544 }
3545 if (!ret) {
3546 ret = qemu_file_get_error(f);
3547 }
0393031a
HZ
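        /*
         * COLO enabled but not yet in the COLO stage: the page went straight
         * into the SVM's memory (host), so keep a backup copy in colo_cache
         * (host_bak) as well.
         */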
3548 if (!ret && host_bak) {
3549 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3550 }
56e93d26
JQ
3551 }
3552
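    /* Wait for any outstanding decompression work before returning. */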
ca1a6b70 3553 ret |= wait_for_decompress_done();
10da4a36
WY
3554 return ret;
3555}
3556
3557static int ram_load(QEMUFile *f, void *opaque, int version_id)
3558{
3559 int ret = 0;
3560 static uint64_t seq_iter;
3561 /*
3562      * If the system is running in postcopy mode, page inserts into host
3563      * memory must be atomic.
3564 */
3565 bool postcopy_running = postcopy_is_running();
3566
3567 seq_iter++;
3568
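    /* Only stream format version 4 (as registered in ram_mig_init()) is accepted. */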
3569 if (version_id != 4) {
3570 return -EINVAL;
3571 }
3572
3573 /*
3574 * This RCU critical section can be very long running.
3575      * When RCU reclaim operations in the code become numerous,
3576 * it will be necessary to reduce the granularity of this
3577 * critical section.
3578 */
89ac5a1d
DDAG
3579 WITH_RCU_READ_LOCK_GUARD() {
3580 if (postcopy_running) {
3581 ret = ram_load_postcopy(f);
3582 } else {
3583 ret = ram_load_precopy(f);
3584 }
10da4a36 3585 }
55c4446b 3586 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
3587
3588 if (!ret && migration_incoming_in_colo_state()) {
3589 colo_flush_ram_cache();
3590 }
56e93d26
JQ
3591 return ret;
3592}
3593
c6467627
VSO
3594static bool ram_has_postcopy(void *opaque)
3595{
469dd51b 3596 RAMBlock *rb;
fbd162e6 3597 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
3598 if (ramblock_is_pmem(rb)) {
3599 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3600 "is not supported now!", rb->idstr, rb->host);
3601 return false;
3602 }
3603 }
3604
c6467627
VSO
3605 return migrate_postcopy_ram();
3606}
3607
edd090c7
PX
3608 /* Sync all the dirty bitmaps with the destination VM. */
3609static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3610{
3611 RAMBlock *block;
3612 QEMUFile *file = s->to_dst_file;
3613 int ramblock_count = 0;
3614
3615 trace_ram_dirty_bitmap_sync_start();
3616
fbd162e6 3617 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
3618 qemu_savevm_send_recv_bitmap(file, block->idstr);
3619 trace_ram_dirty_bitmap_request(block->idstr);
3620 ramblock_count++;
3621 }
3622
3623 trace_ram_dirty_bitmap_sync_wait();
3624
3625     /* Wait until all the ramblocks' dirty bitmaps have been synced */
3626 while (ramblock_count--) {
3627 qemu_sem_wait(&s->rp_state.rp_sem);
3628 }
3629
3630 trace_ram_dirty_bitmap_sync_complete();
3631
3632 return 0;
3633}
3634
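/*
 * Called once per reloaded ramblock bitmap; each post wakes one
 * qemu_sem_wait() in ram_dirty_bitmap_sync_all().
 */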
3635static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3636{
3637 qemu_sem_post(&s->rp_state.rp_sem);
3638}
3639
a335debb
PX
3640/*
3641  * Read the received bitmap and invert it to use as the initial dirty bitmap.
3642  * This is only used when a postcopy migration is paused and we want to
3643  * resume it from a middle point.
3644 */
3645int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3646{
3647 int ret = -EINVAL;
3648 QEMUFile *file = s->rp_state.from_dst_file;
3649 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 3650 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
3651 uint64_t size, end_mark;
3652
3653 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3654
3655 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3656 error_report("%s: incorrect state %s", __func__,
3657 MigrationStatus_str(s->state));
3658 return -EINVAL;
3659 }
3660
3661 /*
3662 * Note: see comments in ramblock_recv_bitmap_send() on why we
3663 * need the endianess convertion, and the paddings.
3664 */
3665 local_size = ROUND_UP(local_size, 8);
3666
3667     /* Add padding */
3668 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3669
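    /*
     * Wire format (see ramblock_recv_bitmap_send()): a be64 size, the bitmap
     * bytes, then a be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).
     */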
3670 size = qemu_get_be64(file);
3671
3672     /* The size of the bitmap should match that of our ramblock */
3673 if (size != local_size) {
3674 error_report("%s: ramblock '%s' bitmap size mismatch "
3675 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3676 block->idstr, size, local_size);
3677 ret = -EINVAL;
3678 goto out;
3679 }
3680
3681 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3682 end_mark = qemu_get_be64(file);
3683
3684 ret = qemu_file_get_error(file);
3685 if (ret || size != local_size) {
3686 error_report("%s: read bitmap failed for ramblock '%s': %d"
3687 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3688 __func__, block->idstr, ret, local_size, size);
3689 ret = -EIO;
3690 goto out;
3691 }
3692
3693 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3694             error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3695 __func__, block->idstr, end_mark);
3696 ret = -EINVAL;
3697 goto out;
3698 }
3699
3700 /*
3701      * Endianness conversion. We are in postcopy (though paused).
3702 * The dirty bitmap won't change. We can directly modify it.
3703 */
3704 bitmap_from_le(block->bmap, le_bitmap, nbits);
3705
3706 /*
3707      * What we received is the "received bitmap". Invert it to get the
3708      * initial dirty bitmap for this ramblock.
3709 */
3710 bitmap_complement(block->bmap, block->bmap, nbits);
3711
3712 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3713
edd090c7
PX
3714 /*
3715      * We succeeded in syncing the bitmap for the current ramblock. If this
3716      * is the last one to sync, we need to notify the main send thread.
3717 */
3718 ram_dirty_bitmap_reload_notify(s);
3719
a335debb
PX
3720 ret = 0;
3721out:
bf269906 3722 g_free(le_bitmap);
a335debb
PX
3723 return ret;
3724}
3725
edd090c7
PX
3726static int ram_resume_prepare(MigrationState *s, void *opaque)
3727{
3728 RAMState *rs = *(RAMState **)opaque;
08614f34 3729 int ret;
edd090c7 3730
08614f34
PX
3731 ret = ram_dirty_bitmap_sync_all(s, rs);
3732 if (ret) {
3733 return ret;
3734 }
3735
3736 ram_state_resume_prepare(rs, s->to_dst_file);
3737
3738 return 0;
edd090c7
PX
3739}
3740
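/* SaveVMHandlers for the "ram" section; hooked up via register_savevm_live() below. */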
56e93d26 3741static SaveVMHandlers savevm_ram_handlers = {
9907e842 3742 .save_setup = ram_save_setup,
56e93d26 3743 .save_live_iterate = ram_save_iterate,
763c906b 3744 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 3745 .save_live_complete_precopy = ram_save_complete,
c6467627 3746 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
3747 .save_live_pending = ram_save_pending,
3748 .load_state = ram_load,
f265e0e4
JQ
3749 .save_cleanup = ram_save_cleanup,
3750 .load_setup = ram_load_setup,
3751 .load_cleanup = ram_load_cleanup,
edd090c7 3752 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
3753};
3754
3755void ram_mig_init(void)
3756{
3757 qemu_mutex_init(&XBZRLE.lock);
ce62df53 3758 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 3759}