/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

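/*
 * Illustrative sketch of how these flags are used on the wire (a summary,
 * not additional protocol): save_page_header() further down ORs the flags
 * into the low bits of the page offset, which is safe because offsets are
 * target-page aligned:
 *
 *     ram_addr_t offset = page << TARGET_PAGE_BITS;
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);   // normal page
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO);   // zero page
 *
 * The load side is then expected to split the value again, masking with
 * TARGET_PAGE_MASK for the offset and ~TARGET_PAGE_MASK for the flags.
 */
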
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct containing the XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by the XBZRLE.lock mutex.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

/* Should be holding either ram_list.mutex, or the RCU lock. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required when the source and destination VMs are not using the
     * same endianness.  (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines.  We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

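/*
 * Illustrative sketch of the record produced above; the matching reader on
 * the source side during postcopy recovery (ram_dirty_bitmap_reload()) is
 * assumed to sit later in this file and to do roughly:
 *
 *     uint64_t size = qemu_get_be64(file);        // bytes, padded to 8
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     uint64_t end_mark = qemu_get_be64(file);
 *     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // stream corrupted, fail the recovery
 *     }
 *
 * For example, a 1GiB RAMBlock with 4KiB target pages has nbits = 262144,
 * so size = 32768 and the record is 8 + 32768 + 8 bytes on the wire.
 */
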
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

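/*
 * Illustrative sketch of the handshake with the worker above (the real
 * producer is compress_page_with_multi_thread() further down):
 *
 *     // migration thread, with comp_done_lock held elsewhere
 *     qemu_mutex_lock(&param->mutex);
 *     param->block = block;            // hand over one page
 *     param->offset = offset;
 *     qemu_cond_signal(&param->cond);  // wake do_data_compress()
 *     qemu_mutex_unlock(&param->mutex);
 *     ...
 *     // the worker flips param->done and signals comp_done_cond when done
 *
 * A NULL param->block is the idle marker, so the worker only sleeps on
 * param->cond while no page is queued for it.
 */
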
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

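/*
 * Worked example of the header layout (illustrative only; "pc.ram" is just
 * a typical block name): the first page sent from that block carries
 *
 *     be64  offset | flags          8 bytes
 *     u8    strlen("pc.ram") = 6    1 byte
 *     bytes "pc.ram"                6 bytes      -> save_page_header() == 15
 *
 * Every following page of the same block has RAM_SAVE_FLAG_CONTINUE set, so
 * only the 8-byte be64 is emitted and the block name is not repeated.
 */
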
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

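/*
 * Worked example of the tailslow path above (numbers are only
 * illustrative): with the throttle currently at 30%, cpu_now = 70.  If
 * bytes_dirty_threshold is half of bytes_dirty_period, then
 * cpu_ideal = 70 * 0.5 = 35 and the raw step would be 70 - 35 = 35.
 * With a cpu_throttle_increment of 10 the step is clamped to 10, so the
 * throttle moves to MIN(30 + 10, pct_max) = 40%.
 */
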
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

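/*
 * On-the-wire shape of one XBZRLE page (an illustrative summary of the code
 * above, not an addition to the protocol):
 *
 *     save_page_header()              8 bytes (+ block name on first page)
 *     u8   ENCODING_FLAG_XBZRLE       1 byte
 *     be16 encoded_len                2 bytes
 *     encoded_buf                     encoded_len bytes
 *
 * which is why the accounting adds encoded_len + 1 + 2 on top of the header
 * size returned by save_page_header().
 */
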
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long.  We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}

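/*
 * Illustrative note: a zero page therefore costs only the page header plus
 * a single 0x00 byte on the wire, e.g. 9 bytes once the block name has
 * already been sent (RAM_SAVE_FLAG_CONTINUE case), instead of a full
 * TARGET_PAGE_SIZE payload.
 */
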
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point.  In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it is really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is on, stop using the data compression after the first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in the last block should have been sent
     * out; keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    compression_counters.busy++;
    return false;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}

1698/**
3d0684b2 1699 * ram_save_host_page: save a whole host page
a82d593b 1700 *
3d0684b2
JQ
1701 * Starting at *offset send pages up to the end of the current host
1702 * page. It's valid for the initial offset to point into the middle of
1703 * a host page, in which case the remainder of the host page is sent.
1704 * Only dirty target pages are sent. Note that the host page size may
1705 * be a huge page for this block.
1eb3fc0a
DDAG
1706 * The saving stops at the boundary of the used_length of the block
1707 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1708 *
3d0684b2
JQ
1709 * Returns the number of pages written or negative on error
1710 *
6f37bb8b 1711 * @rs: current RAM state
3d0684b2 1713 * @pss: data about the page we want to send
a82d593b 1714 * @last_stage: if we are at the completion stage
a82d593b 1715 */
a0a8aa14 1716static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1717 bool last_stage)
a82d593b
DDAG
1718{
1719 int tmppages, pages = 0;
a935e30f
JQ
1720 size_t pagesize_bits =
1721 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1722
fbd162e6 1723 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
1724 error_report("block %s should not be migrated !", pss->block->idstr);
1725 return 0;
1726 }
1727
a82d593b 1728 do {
1faa5665
XG
1729 /* Check if the page is dirty and if so, send it */
1730 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1731 pss->page++;
1732 continue;
1733 }
1734
f20e2865 1735 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1736 if (tmppages < 0) {
1737 return tmppages;
1738 }
1739
1740 pages += tmppages;
a935e30f 1741 pss->page++;
97e1e067
DDAG
1742 /* Allow rate limiting to happen in the middle of huge pages */
1743 migration_rate_limit();
1eb3fc0a 1744 } while ((pss->page & (pagesize_bits - 1)) &&
8bba004c
AR
1745 offset_in_ramblock(pss->block,
1746 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
a82d593b
DDAG
1747
1748 /* The offset we leave with is the last one we looked at */
a935e30f 1749 pss->page--;
a82d593b
DDAG
1750 return pages;
1751}
6c595cde 1752
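/*
 * Illustrative sketch, not part of the original file: how many target
 * pages ram_save_host_page() may walk in one call. The 2 MiB / 4 KiB
 * figures are assumed example values; the real ratio comes from
 * qemu_ram_pagesize() for the block being sent.
 */
static unsigned long example_target_pages_per_host_page(RAMBlock *rb)
{
    /* e.g. a 2 MiB hugetlbfs host page / 4 KiB target page = 512 */
    return qemu_ram_pagesize(rb) >> TARGET_PAGE_BITS;
}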
56e93d26 1753/**
3d0684b2 1754 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1755 *
1756 * Called within an RCU critical section.
1757 *
e8f3735f
XG
1758 * Returns the number of pages written where zero means no dirty pages,
1759 * or negative on error
56e93d26 1760 *
6f37bb8b 1761 * @rs: current RAM state
56e93d26 1762 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1763 *
1764 * On systems where host-page-size > target-page-size it will send all the
1765 * pages in a host page that are dirty.
56e93d26
JQ
1766 */
1767
ce25d337 1768static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1769{
b8fb8cb7 1770 PageSearchStatus pss;
56e93d26 1771 int pages = 0;
b9e60928 1772 bool again, found;
56e93d26 1773
0827b9e9
AA
1774 /* No dirty page as there is zero RAM */
1775 if (!ram_bytes_total()) {
1776 return pages;
1777 }
1778
6f37bb8b 1779 pss.block = rs->last_seen_block;
a935e30f 1780 pss.page = rs->last_page;
b8fb8cb7
DDAG
1781 pss.complete_round = false;
1782
1783 if (!pss.block) {
1784 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1785 }
56e93d26 1786
b9e60928 1787 do {
a82d593b 1788 again = true;
f20e2865 1789 found = get_queued_page(rs, &pss);
b9e60928 1790
a82d593b
DDAG
1791 if (!found) {
1792 /* priority queue empty, so just search for something dirty */
f20e2865 1793 found = find_dirty_block(rs, &pss, &again);
a82d593b 1794 }
f3f491fc 1795
a82d593b 1796 if (found) {
f20e2865 1797 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1798 }
b9e60928 1799 } while (!pages && again);
56e93d26 1800
6f37bb8b 1801 rs->last_seen_block = pss.block;
a935e30f 1802 rs->last_page = pss.page;
56e93d26
JQ
1803
1804 return pages;
1805}
1806
1807void acct_update_position(QEMUFile *f, size_t size, bool zero)
1808{
1809 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1810
56e93d26 1811 if (zero) {
9360447d 1812 ram_counters.duplicate += pages;
56e93d26 1813 } else {
9360447d
JQ
1814 ram_counters.normal += pages;
1815 ram_counters.transferred += size;
56e93d26
JQ
1816 qemu_update_position(f, size);
1817 }
1818}
1819
fbd162e6 1820static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
1821{
1822 RAMBlock *block;
1823 uint64_t total = 0;
1824
89ac5a1d
DDAG
1825 RCU_READ_LOCK_GUARD();
1826
fbd162e6
YK
1827 if (count_ignored) {
1828 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1829 total += block->used_length;
1830 }
1831 } else {
1832 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1833 total += block->used_length;
1834 }
99e15582 1835 }
56e93d26
JQ
1836 return total;
1837}
1838
fbd162e6
YK
1839uint64_t ram_bytes_total(void)
1840{
1841 return ram_bytes_total_common(false);
1842}
1843
f265e0e4 1844static void xbzrle_load_setup(void)
56e93d26 1845{
f265e0e4 1846 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
1847}
1848
f265e0e4
JQ
1849static void xbzrle_load_cleanup(void)
1850{
1851 g_free(XBZRLE.decoded_buf);
1852 XBZRLE.decoded_buf = NULL;
1853}
1854
7d7c96be
PX
1855static void ram_state_cleanup(RAMState **rsp)
1856{
b9ccaf6d
DDAG
1857 if (*rsp) {
1858 migration_page_queue_free(*rsp);
1859 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1860 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1861 g_free(*rsp);
1862 *rsp = NULL;
1863 }
7d7c96be
PX
1864}
1865
84593a08
PX
1866static void xbzrle_cleanup(void)
1867{
1868 XBZRLE_cache_lock();
1869 if (XBZRLE.cache) {
1870 cache_fini(XBZRLE.cache);
1871 g_free(XBZRLE.encoded_buf);
1872 g_free(XBZRLE.current_buf);
1873 g_free(XBZRLE.zero_target_page);
1874 XBZRLE.cache = NULL;
1875 XBZRLE.encoded_buf = NULL;
1876 XBZRLE.current_buf = NULL;
1877 XBZRLE.zero_target_page = NULL;
1878 }
1879 XBZRLE_cache_unlock();
1880}
1881
f265e0e4 1882static void ram_save_cleanup(void *opaque)
56e93d26 1883{
53518d94 1884 RAMState **rsp = opaque;
6b6712ef 1885 RAMBlock *block;
eb859c53 1886
2ff64038 1887 /* the caller holds the iothread lock or is in a bh, so there is
4633456c 1888 * no write race against the migration bitmap
2ff64038 1889 */
6b6712ef
JQ
1890 memory_global_dirty_log_stop();
1891
fbd162e6 1892 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
1893 g_free(block->clear_bmap);
1894 block->clear_bmap = NULL;
6b6712ef
JQ
1895 g_free(block->bmap);
1896 block->bmap = NULL;
56e93d26
JQ
1897 }
1898
84593a08 1899 xbzrle_cleanup();
f0afa331 1900 compress_threads_save_cleanup();
7d7c96be 1901 ram_state_cleanup(rsp);
56e93d26
JQ
1902}
1903
6f37bb8b 1904static void ram_state_reset(RAMState *rs)
56e93d26 1905{
6f37bb8b
JQ
1906 rs->last_seen_block = NULL;
1907 rs->last_sent_block = NULL;
269ace29 1908 rs->last_page = 0;
6f37bb8b
JQ
1909 rs->last_version = ram_list.version;
1910 rs->ram_bulk_stage = true;
6eeb63f7 1911 rs->fpo_enabled = false;
56e93d26
JQ
1912}
1913
1914#define MAX_WAIT 50 /* ms, half buffered_file limit */
1915
4f2e4252
DDAG
1916/*
1917 * 'expected' is the value you expect the bitmap mostly to be full
1918 * of; it won't bother printing lines that are all this value.
1920 */
6b6712ef
JQ
1921void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1922 unsigned long pages)
4f2e4252 1923{
4f2e4252
DDAG
1924 int64_t cur;
1925 int64_t linelen = 128;
1926 char linebuf[129];
1927
6b6712ef 1928 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1929 int64_t curb;
1930 bool found = false;
1931 /*
1932 * Last line; catch the case where the line length
1933 * is longer than remaining ram
1934 */
6b6712ef
JQ
1935 if (cur + linelen > pages) {
1936 linelen = pages - cur;
4f2e4252
DDAG
1937 }
1938 for (curb = 0; curb < linelen; curb++) {
1939 bool thisbit = test_bit(cur + curb, todump);
1940 linebuf[curb] = thisbit ? '1' : '.';
1941 found = found || (thisbit != expected);
1942 }
1943 if (found) {
1944 linebuf[curb] = '\0';
1945 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1946 }
1947 }
1948}
1949
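/*
 * Illustrative sketch, not part of the original file: dumping the dirty
 * bitmap of the first RAMBlock with ram_debug_dump_bitmap(). It assumes
 * it runs inside an RCU critical section while block->bmap is allocated;
 * with expected == false only lines containing at least one '1' (dirty)
 * bit get printed.
 */
static void example_dump_first_block_bitmap(void)
{
    RAMBlock *block = QLIST_FIRST_RCU(&ram_list.blocks);

    ram_debug_dump_bitmap(block->bmap, false,
                          block->used_length >> TARGET_PAGE_BITS);
}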
e0b266f0
DDAG
1950/* **** functions for postcopy ***** */
1951
ced1c616
PB
1952void ram_postcopy_migrated_memory_release(MigrationState *ms)
1953{
1954 struct RAMBlock *block;
ced1c616 1955
fbd162e6 1956 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
1957 unsigned long *bitmap = block->bmap;
1958 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1959 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1960
1961 while (run_start < range) {
1962 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
1963 ram_discard_range(block->idstr,
1964 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1965 ((ram_addr_t)(run_end - run_start))
1966 << TARGET_PAGE_BITS);
ced1c616
PB
1967 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1968 }
1969 }
1970}
1971
3d0684b2
JQ
1972/**
1973 * postcopy_send_discard_bm_ram: discard a RAMBlock
1974 *
1975 * Returns zero on success
1976 *
e0b266f0 1977 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
1978 *
1979 * @ms: current migration state
89dab31b 1980 * @block: RAMBlock to discard
e0b266f0 1981 */
810cf2bb 1982static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 1983{
6b6712ef 1984 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1985 unsigned long current;
1e7cf8c3 1986 unsigned long *bitmap = block->bmap;
e0b266f0 1987
6b6712ef 1988 for (current = 0; current < end; ) {
1e7cf8c3 1989 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 1990 unsigned long zero, discard_length;
e0b266f0 1991
33a5cb62
WY
1992 if (one >= end) {
1993 break;
1994 }
e0b266f0 1995
1e7cf8c3 1996 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
1997
1998 if (zero >= end) {
1999 discard_length = end - one;
e0b266f0 2000 } else {
33a5cb62
WY
2001 discard_length = zero - one;
2002 }
810cf2bb 2003 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2004 current = one + discard_length;
e0b266f0
DDAG
2005 }
2006
2007 return 0;
2008}
2009
3d0684b2
JQ
2010/**
2011 * postcopy_each_ram_send_discard: discard all RAMBlocks
2012 *
2013 * Returns 0 for success or negative for error
2014 *
e0b266f0
DDAG
2015 * Utility for the outgoing postcopy code.
2016 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2017 * passing it bitmap indexes and name.
e0b266f0
DDAG
2018 * (qemu_ram_foreach_block ends up passing unscaled lengths
2019 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2020 *
2021 * @ms: current migration state
e0b266f0
DDAG
2022 */
2023static int postcopy_each_ram_send_discard(MigrationState *ms)
2024{
2025 struct RAMBlock *block;
2026 int ret;
2027
fbd162e6 2028 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2029 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2030
2031 /*
2032 * Postcopy sends chunks of bitmap over the wire, but it
2033 * just needs indexes at this point, avoids it having
2034 * target page specific code.
2035 */
810cf2bb
WY
2036 ret = postcopy_send_discard_bm_ram(ms, block);
2037 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2038 if (ret) {
2039 return ret;
2040 }
2041 }
2042
2043 return 0;
2044}
2045
3d0684b2 2046/**
8324ef86 2047 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2048 *
2049 * Helper for postcopy_chunk_hostpages; it is called once per
2050 * RAMBlock to canonicalize that block's dirty bitmap at host-page
2051 * granularity.
99e314eb 2052 *
3d0684b2
JQ
2053 * Postcopy requires that all target pages in a host page are dirty or
2054 * clean, not a mix. This function canonicalizes the bitmap.
99e314eb 2055 *
3d0684b2 2056 * @ms: current migration state
3d0684b2 2057 * @block: block that contains the page we want to canonicalize
99e314eb 2058 */
1e7cf8c3 2059static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2060{
53518d94 2061 RAMState *rs = ram_state;
6b6712ef 2062 unsigned long *bitmap = block->bmap;
29c59172 2063 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2064 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2065 unsigned long run_start;
2066
29c59172
DDAG
2067 if (block->page_size == TARGET_PAGE_SIZE) {
2068 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2069 return;
2070 }
2071
1e7cf8c3
WY
2072 /* Find a dirty page */
2073 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2074
6b6712ef 2075 while (run_start < pages) {
99e314eb
DDAG
2076
2077 /*
2078 * If the start of this run of pages is in the middle of a host
2079 * page, then we need to fixup this host page.
2080 */
9dec3cc3 2081 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2082 /* Find the end of this run */
1e7cf8c3 2083 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2084 /*
2085 * If the end isn't at the start of a host page, then the
2086 * run doesn't finish at the end of a host page
2087 * and we need to discard.
2088 */
99e314eb
DDAG
2089 }
2090
9dec3cc3 2091 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2092 unsigned long page;
dad45ab2
WY
2093 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2094 host_ratio);
2095 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2096
99e314eb
DDAG
2097 /* Clean up the bitmap */
2098 for (page = fixup_start_addr;
2099 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2100 /*
2101 * Remark them as dirty, updating the count for any pages
2102 * that weren't previously dirty.
2103 */
0d8ec885 2104 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2105 }
2106 }
2107
1e7cf8c3
WY
2108 /* Find the next dirty page for the next iteration */
2109 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2110 }
2111}
2112
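/*
 * Illustrative sketch, not part of the original file: the alignment
 * arithmetic used by postcopy_chunk_hostpages_pass(), with assumed
 * numbers (2 MiB host pages, 4 KiB target pages, so host_ratio = 512).
 * A dirty run that ends at target page 1000 stops in the middle of a
 * host page, so pages 512..1023 are re-marked dirty and the scan
 * resumes at 1024.
 */
static void example_hostpage_fixup_arithmetic(void)
{
    unsigned long host_ratio = 512;  /* assumed: 2 MiB / 4 KiB */
    unsigned long run_start = 1000;  /* assumed end of a dirty run */
    unsigned long fixup_start = QEMU_ALIGN_DOWN(run_start, host_ratio); /* 512 */
    unsigned long next_start = QEMU_ALIGN_UP(run_start, host_ratio);    /* 1024 */

    /* Target pages fixup_start..next_start - 1 would be re-marked dirty */
    (void)fixup_start;
    (void)next_start;
}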
3d0684b2 2113/**
89dab31b 2114 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 2115 *
99e314eb
DDAG
2116 * Utility for the outgoing postcopy code.
2117 *
2118 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2119 * dirty host-page size chunks as all dirty. In this case the host-page
2120 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2121 *
3d0684b2
JQ
2122 * Returns zero on success
2123 *
2124 * @ms: current migration state
6b6712ef 2125 * @block: block we want to work with
99e314eb 2126 */
6b6712ef 2127static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2128{
810cf2bb 2129 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2130
6b6712ef 2131 /*
1e7cf8c3 2132 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 2133 */
1e7cf8c3 2134 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 2135
810cf2bb 2136 postcopy_discard_send_finish(ms);
99e314eb
DDAG
2137 return 0;
2138}
2139
3d0684b2
JQ
2140/**
2141 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2142 *
2143 * Returns zero on success
2144 *
e0b266f0
DDAG
2145 * Transmit the set of pages to be discarded after precopy to the target;
2146 * these are pages that:
2147 * a) Have been previously transmitted but are now dirty again
2148 * b) Have never been transmitted; this ensures that
2149 * any pages on the destination that have been mapped by background
2150 * tasks get discarded (transparent huge pages are the specific concern)
2151 * Hopefully this is pretty sparse
3d0684b2
JQ
2152 *
2153 * @ms: current migration state
e0b266f0
DDAG
2154 */
2155int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2156{
53518d94 2157 RAMState *rs = ram_state;
6b6712ef 2158 RAMBlock *block;
e0b266f0 2159 int ret;
e0b266f0 2160
89ac5a1d 2161 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2162
2163 /* This should be our last sync, the src is now paused */
eb859c53 2164 migration_bitmap_sync(rs);
e0b266f0 2165
6b6712ef
JQ
2166 /* Easiest way to make sure we don't resume in the middle of a host-page */
2167 rs->last_seen_block = NULL;
2168 rs->last_sent_block = NULL;
2169 rs->last_page = 0;
e0b266f0 2170
fbd162e6 2171 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2172 /* Deal with TPS != HPS and huge pages */
2173 ret = postcopy_chunk_hostpages(ms, block);
2174 if (ret) {
6b6712ef
JQ
2175 return ret;
2176 }
e0b266f0 2177
e0b266f0 2178#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
2179 ram_debug_dump_bitmap(block->bmap, true,
2180 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 2181#endif
6b6712ef
JQ
2182 }
2183 trace_ram_postcopy_send_discard_bitmap();
e0b266f0 2184
b3ac2b94 2185 return postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
2186}
2187
3d0684b2
JQ
2188/**
2189 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2190 *
3d0684b2 2191 * Returns zero on success
e0b266f0 2192 *
36449157
JQ
2193 * @rbname: name of the RAMBlock of the request. NULL means the
2194 * same as the last one.
3d0684b2
JQ
2195 * @start: RAMBlock starting page
2196 * @length: RAMBlock size
e0b266f0 2197 */
aaa2064c 2198int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2199{
36449157 2200 trace_ram_discard_range(rbname, start, length);
d3a5038c 2201
89ac5a1d 2202 RCU_READ_LOCK_GUARD();
36449157 2203 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2204
2205 if (!rb) {
36449157 2206 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2207 return -1;
e0b266f0
DDAG
2208 }
2209
814bb08f
PX
2210 /*
2211 * On source VM, we don't need to update the received bitmap since
2212 * we don't even have one.
2213 */
2214 if (rb->receivedmap) {
2215 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2216 length >> qemu_target_page_bits());
2217 }
2218
03acb4e9 2219 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2220}
2221
84593a08
PX
2222/*
2223 * For every allocation, we will try not to crash the VM if the
2224 * allocation fails.
2225 */
2226static int xbzrle_init(void)
2227{
2228 Error *local_err = NULL;
2229
2230 if (!migrate_use_xbzrle()) {
2231 return 0;
2232 }
2233
2234 XBZRLE_cache_lock();
2235
2236 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2237 if (!XBZRLE.zero_target_page) {
2238 error_report("%s: Error allocating zero page", __func__);
2239 goto err_out;
2240 }
2241
2242 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2243 TARGET_PAGE_SIZE, &local_err);
2244 if (!XBZRLE.cache) {
2245 error_report_err(local_err);
2246 goto free_zero_page;
2247 }
2248
2249 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2250 if (!XBZRLE.encoded_buf) {
2251 error_report("%s: Error allocating encoded_buf", __func__);
2252 goto free_cache;
2253 }
2254
2255 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2256 if (!XBZRLE.current_buf) {
2257 error_report("%s: Error allocating current_buf", __func__);
2258 goto free_encoded_buf;
2259 }
2260
2261 /* We are all good */
2262 XBZRLE_cache_unlock();
2263 return 0;
2264
2265free_encoded_buf:
2266 g_free(XBZRLE.encoded_buf);
2267 XBZRLE.encoded_buf = NULL;
2268free_cache:
2269 cache_fini(XBZRLE.cache);
2270 XBZRLE.cache = NULL;
2271free_zero_page:
2272 g_free(XBZRLE.zero_target_page);
2273 XBZRLE.zero_target_page = NULL;
2274err_out:
2275 XBZRLE_cache_unlock();
2276 return -ENOMEM;
2277}
2278
53518d94 2279static int ram_state_init(RAMState **rsp)
56e93d26 2280{
7d00ee6a
PX
2281 *rsp = g_try_new0(RAMState, 1);
2282
2283 if (!*rsp) {
2284 error_report("%s: Init ramstate fail", __func__);
2285 return -1;
2286 }
53518d94
JQ
2287
2288 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2289 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2290 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2291
7d00ee6a 2292 /*
40c4d4a8
IR
2293 * Count the total number of pages used by ram blocks not including any
2294 * gaps due to alignment or unplugs.
03158519 2295 * This must match the initial values of the dirty bitmap.
7d00ee6a 2296 */
40c4d4a8 2297 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2298 ram_state_reset(*rsp);
2299
2300 return 0;
2301}
2302
d6eff5d7 2303static void ram_list_init_bitmaps(void)
7d00ee6a 2304{
002cad6b 2305 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2306 RAMBlock *block;
2307 unsigned long pages;
002cad6b 2308 uint8_t shift;
56e93d26 2309
0827b9e9
AA
2310 /* Skip setting bitmap if there is no RAM */
2311 if (ram_bytes_total()) {
002cad6b
PX
2312 shift = ms->clear_bitmap_shift;
2313 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2314 error_report("clear_bitmap_shift (%u) too big, using "
2315 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2316 shift = CLEAR_BITMAP_SHIFT_MAX;
2317 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2318 error_report("clear_bitmap_shift (%u) too small, using "
2319 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2320 shift = CLEAR_BITMAP_SHIFT_MIN;
2321 }
2322
fbd162e6 2323 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2324 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2325 /*
2326 * The initial dirty bitmap for migration must be set with all
2327 * ones to make sure we'll migrate every guest RAM page to
2328 * destination.
40c4d4a8
IR
2329 * Here we set RAMBlock.bmap all to 1 because when we restart a
2330 * new migration after a failed one, ram_list.
2331 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2332 * guest memory.
03158519 2333 */
6b6712ef 2334 block->bmap = bitmap_new(pages);
40c4d4a8 2335 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2336 block->clear_bmap_shift = shift;
2337 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2338 }
f3f491fc 2339 }
d6eff5d7
PX
2340}
2341
2342static void ram_init_bitmaps(RAMState *rs)
2343{
2344 /* For memory_global_dirty_log_start below. */
2345 qemu_mutex_lock_iothread();
2346 qemu_mutex_lock_ramlist();
f3f491fc 2347
89ac5a1d
DDAG
2348 WITH_RCU_READ_LOCK_GUARD() {
2349 ram_list_init_bitmaps();
2350 memory_global_dirty_log_start();
2351 migration_bitmap_sync_precopy(rs);
2352 }
56e93d26 2353 qemu_mutex_unlock_ramlist();
49877834 2354 qemu_mutex_unlock_iothread();
d6eff5d7
PX
2355}
2356
2357static int ram_init_all(RAMState **rsp)
2358{
2359 if (ram_state_init(rsp)) {
2360 return -1;
2361 }
2362
2363 if (xbzrle_init()) {
2364 ram_state_cleanup(rsp);
2365 return -1;
2366 }
2367
2368 ram_init_bitmaps(*rsp);
a91246c9
HZ
2369
2370 return 0;
2371}
2372
08614f34
PX
2373static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2374{
2375 RAMBlock *block;
2376 uint64_t pages = 0;
2377
2378 /*
2379 * Postcopy is not using xbzrle/compression, so no need for that.
2380 * Also, since the source is already halted, we don't need to care
2381 * about dirty page logging either.
2382 */
2383
fbd162e6 2384 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2385 pages += bitmap_count_one(block->bmap,
2386 block->used_length >> TARGET_PAGE_BITS);
2387 }
2388
2389 /* This may not be aligned with current bitmaps. Recalculate. */
2390 rs->migration_dirty_pages = pages;
2391
2392 rs->last_seen_block = NULL;
2393 rs->last_sent_block = NULL;
2394 rs->last_page = 0;
2395 rs->last_version = ram_list.version;
2396 /*
2397 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2398 * matter what we have sent.
2399 */
2400 rs->ram_bulk_stage = false;
2401
2402 /* Update RAMState cache of output QEMUFile */
2403 rs->f = out;
2404
2405 trace_ram_state_resume_prepare(pages);
2406}
2407
6bcb05fc
WW
2408/*
2409 * This function clears bits of the free pages reported by the caller from the
2410 * migration dirty bitmap. @addr is the host address corresponding to the
2411 * start of the continuous guest free pages, and @len is the total bytes of
2412 * those pages.
2413 */
2414void qemu_guest_free_page_hint(void *addr, size_t len)
2415{
2416 RAMBlock *block;
2417 ram_addr_t offset;
2418 size_t used_len, start, npages;
2419 MigrationState *s = migrate_get_current();
2420
2421 /* This function is currently expected to be used during live migration */
2422 if (!migration_is_setup_or_active(s->state)) {
2423 return;
2424 }
2425
2426 for (; len > 0; len -= used_len, addr += used_len) {
2427 block = qemu_ram_block_from_host(addr, false, &offset);
2428 if (unlikely(!block || offset >= block->used_length)) {
2429 /*
2430 * The implementation might not support RAMBlock resize during
2431 * live migration, but it could happen in theory with future
2432 * updates. So we add a check here to capture that case.
2433 */
2434 error_report_once("%s unexpected error", __func__);
2435 return;
2436 }
2437
2438 if (len <= block->used_length - offset) {
2439 used_len = len;
2440 } else {
2441 used_len = block->used_length - offset;
2442 }
2443
2444 start = offset >> TARGET_PAGE_BITS;
2445 npages = used_len >> TARGET_PAGE_BITS;
2446
2447 qemu_mutex_lock(&ram_state->bitmap_mutex);
2448 ram_state->migration_dirty_pages -=
2449 bitmap_count_one_with_offset(block->bmap, start, npages);
2450 bitmap_clear(block->bmap, start, npages);
2451 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2452 }
2453}
2454
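/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller of qemu_guest_free_page_hint(). In QEMU the hint comes from
 * virtio-balloon free page reporting; the host address and the 2 MiB
 * length here are assumptions for the example.
 */
static void example_report_free_range(void *host_addr)
{
    /* Drop 2 MiB of reported-free guest RAM from the dirty bitmap */
    qemu_guest_free_page_hint(host_addr, 2 * 1024 * 1024);
}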
3d0684b2
JQ
2455/*
2456 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2457 * a long-running RCU critical section. When rcu-reclaims in the code
2458 * start to become numerous it will be necessary to reduce the
2459 * granularity of these critical sections.
2460 */
2461
3d0684b2
JQ
2462/**
2463 * ram_save_setup: Setup RAM for migration
2464 *
2465 * Returns zero to indicate success and negative for error
2466 *
2467 * @f: QEMUFile where to send the data
2468 * @opaque: RAMState pointer
2469 */
a91246c9
HZ
2470static int ram_save_setup(QEMUFile *f, void *opaque)
2471{
53518d94 2472 RAMState **rsp = opaque;
a91246c9
HZ
2473 RAMBlock *block;
2474
dcaf446e
XG
2475 if (compress_threads_save_setup()) {
2476 return -1;
2477 }
2478
a91246c9
HZ
2479 /* migration has already setup the bitmap, reuse it. */
2480 if (!migration_in_colo_state()) {
7d00ee6a 2481 if (ram_init_all(rsp) != 0) {
dcaf446e 2482 compress_threads_save_cleanup();
a91246c9 2483 return -1;
53518d94 2484 }
a91246c9 2485 }
53518d94 2486 (*rsp)->f = f;
a91246c9 2487
0e6ebd48
DDAG
2488 WITH_RCU_READ_LOCK_GUARD() {
2489 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2490
0e6ebd48
DDAG
2491 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2492 qemu_put_byte(f, strlen(block->idstr));
2493 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2494 qemu_put_be64(f, block->used_length);
2495 if (migrate_postcopy_ram() && block->page_size !=
2496 qemu_host_page_size) {
2497 qemu_put_be64(f, block->page_size);
2498 }
2499 if (migrate_ignore_shared()) {
2500 qemu_put_be64(f, block->mr->addr);
2501 }
fbd162e6 2502 }
56e93d26
JQ
2503 }
2504
56e93d26
JQ
2505 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2506 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2507
99f2c6fb 2508 multifd_send_sync_main(f);
56e93d26 2509 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2510 qemu_fflush(f);
56e93d26
JQ
2511
2512 return 0;
2513}
2514
3d0684b2
JQ
2515/**
2516 * ram_save_iterate: iterative stage for migration
2517 *
2518 * Returns zero to indicate success and negative for error
2519 *
2520 * @f: QEMUFile where to send the data
2521 * @opaque: RAMState pointer
2522 */
56e93d26
JQ
2523static int ram_save_iterate(QEMUFile *f, void *opaque)
2524{
53518d94
JQ
2525 RAMState **temp = opaque;
2526 RAMState *rs = *temp;
3d4095b2 2527 int ret = 0;
56e93d26
JQ
2528 int i;
2529 int64_t t0;
5c90308f 2530 int done = 0;
56e93d26 2531
b2557345
PL
2532 if (blk_mig_bulk_active()) {
2533 /* Avoid transferring ram during bulk phase of block migration as
2534 * the bulk phase will usually take a long time and transferring
2535 * ram updates during that time is pointless. */
2536 goto out;
2537 }
2538
89ac5a1d
DDAG
2539 WITH_RCU_READ_LOCK_GUARD() {
2540 if (ram_list.version != rs->last_version) {
2541 ram_state_reset(rs);
2542 }
56e93d26 2543
89ac5a1d
DDAG
2544 /* Read version before ram_list.blocks */
2545 smp_rmb();
56e93d26 2546
89ac5a1d 2547 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2548
89ac5a1d
DDAG
2549 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2550 i = 0;
2551 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2552 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2553 int pages;
e03a34f8 2554
89ac5a1d
DDAG
2555 if (qemu_file_get_error(f)) {
2556 break;
2557 }
e8f3735f 2558
89ac5a1d
DDAG
2559 pages = ram_find_and_save_block(rs, false);
2560 /* no more pages to send */
2561 if (pages == 0) {
2562 done = 1;
2563 break;
2564 }
e8f3735f 2565
89ac5a1d
DDAG
2566 if (pages < 0) {
2567 qemu_file_set_error(f, pages);
56e93d26
JQ
2568 break;
2569 }
89ac5a1d
DDAG
2570
2571 rs->target_page_count += pages;
2572
644acf99
WY
2573 /*
2574 * During postcopy, it is necessary to make sure one whole host
2575 * page is sent in one chunk.
2576 */
2577 if (migrate_postcopy_ram()) {
2578 flush_compressed_data(rs);
2579 }
2580
89ac5a1d
DDAG
2581 /*
2582 * we want to check in the 1st loop, just in case it was the 1st
2583 * time and we had to sync the dirty bitmap.
2584 * qemu_clock_get_ns() is a bit expensive, so we only check every
2585 * few iterations
2586 */
2587 if ((i & 63) == 0) {
2588 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2589 1000000;
2590 if (t1 > MAX_WAIT) {
2591 trace_ram_save_iterate_big_wait(t1, i);
2592 break;
2593 }
2594 }
2595 i++;
56e93d26 2596 }
56e93d26 2597 }
56e93d26
JQ
2598
2599 /*
2600 * Must occur before EOS (or any QEMUFile operation)
2601 * because of RDMA protocol.
2602 */
2603 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2604
b2557345 2605out:
b69a0227
JQ
2606 if (ret >= 0
2607 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 2608 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2609 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2610 qemu_fflush(f);
2611 ram_counters.transferred += 8;
56e93d26 2612
3d4095b2
JQ
2613 ret = qemu_file_get_error(f);
2614 }
56e93d26
JQ
2615 if (ret < 0) {
2616 return ret;
2617 }
2618
5c90308f 2619 return done;
56e93d26
JQ
2620}
2621
3d0684b2
JQ
2622/**
2623 * ram_save_complete: function called to send the remaining amount of ram
2624 *
e8f3735f 2625 * Returns zero to indicate success or negative on error
3d0684b2
JQ
2626 *
2627 * Called with iothread lock
2628 *
2629 * @f: QEMUFile where to send the data
2630 * @opaque: RAMState pointer
2631 */
56e93d26
JQ
2632static int ram_save_complete(QEMUFile *f, void *opaque)
2633{
53518d94
JQ
2634 RAMState **temp = opaque;
2635 RAMState *rs = *temp;
e8f3735f 2636 int ret = 0;
6f37bb8b 2637
89ac5a1d
DDAG
2638 WITH_RCU_READ_LOCK_GUARD() {
2639 if (!migration_in_postcopy()) {
2640 migration_bitmap_sync_precopy(rs);
2641 }
56e93d26 2642
89ac5a1d 2643 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 2644
89ac5a1d 2645 /* try transferring iterative blocks of memory */
56e93d26 2646
89ac5a1d
DDAG
2647 /* flush all remaining blocks regardless of rate limiting */
2648 while (true) {
2649 int pages;
56e93d26 2650
89ac5a1d
DDAG
2651 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2652 /* no more blocks to send */
2653 if (pages == 0) {
2654 break;
2655 }
2656 if (pages < 0) {
2657 ret = pages;
2658 break;
2659 }
e8f3735f 2660 }
56e93d26 2661
89ac5a1d
DDAG
2662 flush_compressed_data(rs);
2663 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2664 }
d09a6fde 2665
3d4095b2 2666 if (ret >= 0) {
99f2c6fb 2667 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2668 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2669 qemu_fflush(f);
2670 }
56e93d26 2671
e8f3735f 2672 return ret;
56e93d26
JQ
2673}
2674
c31b098f 2675static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
2676 uint64_t *res_precopy_only,
2677 uint64_t *res_compatible,
2678 uint64_t *res_postcopy_only)
56e93d26 2679{
53518d94
JQ
2680 RAMState **temp = opaque;
2681 RAMState *rs = *temp;
56e93d26
JQ
2682 uint64_t remaining_size;
2683
9edabd4d 2684 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2685
5727309d 2686 if (!migration_in_postcopy() &&
663e6c1d 2687 remaining_size < max_size) {
56e93d26 2688 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
2689 WITH_RCU_READ_LOCK_GUARD() {
2690 migration_bitmap_sync_precopy(rs);
2691 }
56e93d26 2692 qemu_mutex_unlock_iothread();
9edabd4d 2693 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2694 }
c31b098f 2695
86e1167e
VSO
2696 if (migrate_postcopy_ram()) {
2697 /* We can do postcopy, and all the data is postcopiable */
47995026 2698 *res_compatible += remaining_size;
86e1167e 2699 } else {
47995026 2700 *res_precopy_only += remaining_size;
86e1167e 2701 }
56e93d26
JQ
2702}
2703
2704static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2705{
2706 unsigned int xh_len;
2707 int xh_flags;
063e760a 2708 uint8_t *loaded_data;
56e93d26 2709
56e93d26
JQ
2710 /* extract RLE header */
2711 xh_flags = qemu_get_byte(f);
2712 xh_len = qemu_get_be16(f);
2713
2714 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2715 error_report("Failed to load XBZRLE page - wrong compression!");
2716 return -1;
2717 }
2718
2719 if (xh_len > TARGET_PAGE_SIZE) {
2720 error_report("Failed to load XBZRLE page - len overflow!");
2721 return -1;
2722 }
f265e0e4 2723 loaded_data = XBZRLE.decoded_buf;
56e93d26 2724 /* load data and decode */
f265e0e4 2725 /* it can change loaded_data to point to an internal buffer */
063e760a 2726 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2727
2728 /* decode RLE */
063e760a 2729 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2730 TARGET_PAGE_SIZE) == -1) {
2731 error_report("Failed to load XBZRLE page - decode error!");
2732 return -1;
2733 }
2734
2735 return 0;
2736}
2737
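/*
 * Illustrative sketch, not part of the original file: the sending-side
 * framing that load_xbzrle() above expects, i.e. a flags byte, a
 * big-endian 16-bit encoded length, then the encoded bytes. The real
 * writer is save_xbzrle_page() earlier in this file; this only shows
 * the layout.
 */
static void example_put_xbzrle_page(QEMUFile *f, const uint8_t *encoded,
                                    uint16_t encoded_len)
{
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, encoded, encoded_len);
}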
3d0684b2
JQ
2738/**
2739 * ram_block_from_stream: read a RAMBlock id from the migration stream
2740 *
2741 * Must be called from within a rcu critical section.
2742 *
56e93d26 2743 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2744 *
3d0684b2
JQ
2745 * @f: QEMUFile where to read the data from
2746 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2747 */
3d0684b2 2748static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2749{
2750 static RAMBlock *block = NULL;
2751 char id[256];
2752 uint8_t len;
2753
2754 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2755 if (!block) {
56e93d26
JQ
2756 error_report("Ack, bad migration stream!");
2757 return NULL;
2758 }
4c4bad48 2759 return block;
56e93d26
JQ
2760 }
2761
2762 len = qemu_get_byte(f);
2763 qemu_get_buffer(f, (uint8_t *)id, len);
2764 id[len] = 0;
2765
e3dd7493 2766 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2767 if (!block) {
2768 error_report("Can't find block %s", id);
2769 return NULL;
56e93d26
JQ
2770 }
2771
fbd162e6 2772 if (ramblock_is_ignored(block)) {
b895de50
CLG
2773 error_report("block %s should not be migrated !", id);
2774 return NULL;
2775 }
2776
4c4bad48
HZ
2777 return block;
2778}
2779
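/*
 * Illustrative sketch, not part of the original file: the sending-side
 * framing that ram_block_from_stream() above undoes. When the page
 * belongs to the same RAMBlock as the previous one, only the offset
 * with RAM_SAVE_FLAG_CONTINUE is sent; otherwise the block name follows
 * as a length byte plus the idstr. The real writer is save_page_header();
 * this only shows the layout.
 */
static void example_put_page_header(QEMUFile *f, RAMBlock *block,
                                    ram_addr_t offset, int flags,
                                    bool same_block_as_last)
{
    if (same_block_as_last) {
        qemu_put_be64(f, offset | flags | RAM_SAVE_FLAG_CONTINUE);
    } else {
        qemu_put_be64(f, offset | flags);
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
    }
}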
2780static inline void *host_from_ram_block_offset(RAMBlock *block,
2781 ram_addr_t offset)
2782{
2783 if (!offset_in_ramblock(block, offset)) {
2784 return NULL;
2785 }
2786
2787 return block->host + offset;
56e93d26
JQ
2788}
2789
13af18f2 2790static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 2791 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
2792{
2793 if (!offset_in_ramblock(block, offset)) {
2794 return NULL;
2795 }
2796 if (!block->colo_cache) {
2797 error_report("%s: colo_cache is NULL in block :%s",
2798 __func__, block->idstr);
2799 return NULL;
2800 }
7d9acafa
ZC
2801
2802 /*
2803 * During colo checkpoint, we need a bitmap of these migrated pages.
2804 * It helps us decide which pages in the ram cache should be flushed
2805 * into VM's RAM later.
2806 */
8af66371
HZ
2807 if (record_bitmap &&
2808 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
2809 ram_state->migration_dirty_pages++;
2810 }
13af18f2
ZC
2811 return block->colo_cache + offset;
2812}
2813
3d0684b2
JQ
2814/**
2815 * ram_handle_compressed: handle the zero page case
2816 *
56e93d26
JQ
2817 * If a page (or a whole RDMA chunk) has been
2818 * determined to be zero, then zap it.
3d0684b2
JQ
2819 *
2820 * @host: host address for the zero page
2821 * @ch: what the page is filled from. We only support zero
2822 * @size: size of the zero page
56e93d26
JQ
2823 */
2824void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2825{
2826 if (ch != 0 || !is_zero_range(host, size)) {
2827 memset(host, ch, size);
2828 }
2829}
2830
797ca154
XG
2831/* return the size after decompression, or negative value on error */
2832static int
2833qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2834 const uint8_t *source, size_t source_len)
2835{
2836 int err;
2837
2838 err = inflateReset(stream);
2839 if (err != Z_OK) {
2840 return -1;
2841 }
2842
2843 stream->avail_in = source_len;
2844 stream->next_in = (uint8_t *)source;
2845 stream->avail_out = dest_len;
2846 stream->next_out = dest;
2847
2848 err = inflate(stream, Z_NO_FLUSH);
2849 if (err != Z_STREAM_END) {
2850 return -1;
2851 }
2852
2853 return stream->total_out;
2854}
2855
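/*
 * Illustrative sketch, not part of the original file: the compression
 * counterpart of qemu_uncompress_data() above, using the same zlib
 * stream-reset pattern. The real sender-side code lives in the
 * compression thread helpers elsewhere in this file; this assumes the
 * stream was initialized with deflateInit().
 */
static ssize_t example_compress_data(z_stream *stream, uint8_t *dest,
                                     size_t dest_len, const uint8_t *source,
                                     size_t source_len)
{
    if (deflateReset(stream) != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* Z_FINISH: compress everything in one shot into dest */
    if (deflate(stream, Z_FINISH) != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}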
56e93d26
JQ
2856static void *do_data_decompress(void *opaque)
2857{
2858 DecompressParam *param = opaque;
2859 unsigned long pagesize;
33d151f4 2860 uint8_t *des;
34ab9e97 2861 int len, ret;
56e93d26 2862
33d151f4 2863 qemu_mutex_lock(&param->mutex);
90e56fb4 2864 while (!param->quit) {
33d151f4
LL
2865 if (param->des) {
2866 des = param->des;
2867 len = param->len;
2868 param->des = 0;
2869 qemu_mutex_unlock(&param->mutex);
2870
56e93d26 2871 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
2872
2873 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2874 param->compbuf, len);
f548222c 2875 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
2876 error_report("decompress data failed");
2877 qemu_file_set_error(decomp_file, ret);
2878 }
73a8912b 2879
33d151f4
LL
2880 qemu_mutex_lock(&decomp_done_lock);
2881 param->done = true;
2882 qemu_cond_signal(&decomp_done_cond);
2883 qemu_mutex_unlock(&decomp_done_lock);
2884
2885 qemu_mutex_lock(&param->mutex);
2886 } else {
2887 qemu_cond_wait(&param->cond, &param->mutex);
2888 }
56e93d26 2889 }
33d151f4 2890 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2891
2892 return NULL;
2893}
2894
34ab9e97 2895static int wait_for_decompress_done(void)
5533b2e9
LL
2896{
2897 int idx, thread_count;
2898
2899 if (!migrate_use_compression()) {
34ab9e97 2900 return 0;
5533b2e9
LL
2901 }
2902
2903 thread_count = migrate_decompress_threads();
2904 qemu_mutex_lock(&decomp_done_lock);
2905 for (idx = 0; idx < thread_count; idx++) {
2906 while (!decomp_param[idx].done) {
2907 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2908 }
2909 }
2910 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 2911 return qemu_file_get_error(decomp_file);
5533b2e9
LL
2912}
2913
f0afa331 2914static void compress_threads_load_cleanup(void)
56e93d26
JQ
2915{
2916 int i, thread_count;
2917
3416ab5b
JQ
2918 if (!migrate_use_compression()) {
2919 return;
2920 }
56e93d26
JQ
2921 thread_count = migrate_decompress_threads();
2922 for (i = 0; i < thread_count; i++) {
797ca154
XG
2923 /*
2924 * we use it as an indicator of whether the thread is
2925 * properly init'd or not
2926 */
2927 if (!decomp_param[i].compbuf) {
2928 break;
2929 }
2930
56e93d26 2931 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2932 decomp_param[i].quit = true;
56e93d26
JQ
2933 qemu_cond_signal(&decomp_param[i].cond);
2934 qemu_mutex_unlock(&decomp_param[i].mutex);
2935 }
2936 for (i = 0; i < thread_count; i++) {
797ca154
XG
2937 if (!decomp_param[i].compbuf) {
2938 break;
2939 }
2940
56e93d26
JQ
2941 qemu_thread_join(decompress_threads + i);
2942 qemu_mutex_destroy(&decomp_param[i].mutex);
2943 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 2944 inflateEnd(&decomp_param[i].stream);
56e93d26 2945 g_free(decomp_param[i].compbuf);
797ca154 2946 decomp_param[i].compbuf = NULL;
56e93d26
JQ
2947 }
2948 g_free(decompress_threads);
2949 g_free(decomp_param);
56e93d26
JQ
2950 decompress_threads = NULL;
2951 decomp_param = NULL;
34ab9e97 2952 decomp_file = NULL;
56e93d26
JQ
2953}
2954
34ab9e97 2955static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
2956{
2957 int i, thread_count;
2958
2959 if (!migrate_use_compression()) {
2960 return 0;
2961 }
2962
2963 thread_count = migrate_decompress_threads();
2964 decompress_threads = g_new0(QemuThread, thread_count);
2965 decomp_param = g_new0(DecompressParam, thread_count);
2966 qemu_mutex_init(&decomp_done_lock);
2967 qemu_cond_init(&decomp_done_cond);
34ab9e97 2968 decomp_file = f;
797ca154
XG
2969 for (i = 0; i < thread_count; i++) {
2970 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2971 goto exit;
2972 }
2973
2974 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2975 qemu_mutex_init(&decomp_param[i].mutex);
2976 qemu_cond_init(&decomp_param[i].cond);
2977 decomp_param[i].done = true;
2978 decomp_param[i].quit = false;
2979 qemu_thread_create(decompress_threads + i, "decompress",
2980 do_data_decompress, decomp_param + i,
2981 QEMU_THREAD_JOINABLE);
2982 }
2983 return 0;
2984exit:
2985 compress_threads_load_cleanup();
2986 return -1;
2987}
2988
c1bc6626 2989static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2990 void *host, int len)
2991{
2992 int idx, thread_count;
2993
2994 thread_count = migrate_decompress_threads();
73a8912b 2995 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2996 while (true) {
2997 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2998 if (decomp_param[idx].done) {
33d151f4
LL
2999 decomp_param[idx].done = false;
3000 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3001 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3002 decomp_param[idx].des = host;
3003 decomp_param[idx].len = len;
33d151f4
LL
3004 qemu_cond_signal(&decomp_param[idx].cond);
3005 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3006 break;
3007 }
3008 }
3009 if (idx < thread_count) {
3010 break;
73a8912b
LL
3011 } else {
3012 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3013 }
3014 }
73a8912b 3015 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3016}
3017
13af18f2
ZC
3018/*
3019 * colo cache: this is for the secondary VM, we cache the whole
3020 * memory of the secondary VM; the global lock needs to be held
3021 * to call this helper.
3022 */
3023int colo_init_ram_cache(void)
3024{
3025 RAMBlock *block;
3026
44901b5a
PB
3027 WITH_RCU_READ_LOCK_GUARD() {
3028 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3029 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3030 NULL,
3031 false);
3032 if (!block->colo_cache) {
3033 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3034 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3035 block->used_length);
3036 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3037 if (block->colo_cache) {
3038 qemu_anon_ram_free(block->colo_cache, block->used_length);
3039 block->colo_cache = NULL;
3040 }
89ac5a1d 3041 }
44901b5a 3042 return -errno;
89ac5a1d 3043 }
13af18f2 3044 }
13af18f2 3045 }
44901b5a 3046
7d9acafa
ZC
3047 /*
3048 * Record the dirty pages that were sent by PVM; we use this dirty bitmap
3049 * to decide which pages in the cache should be flushed into SVM's RAM. Here
3050 * we use the same name 'ram_bitmap' as for migration.
3051 */
3052 if (ram_bytes_total()) {
3053 RAMBlock *block;
3054
fbd162e6 3055 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3056 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3057 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3058 }
3059 }
7d9acafa 3060
0393031a 3061 ram_state_init(&ram_state);
13af18f2 3062 return 0;
13af18f2
ZC
3063}
3064
0393031a
HZ
3065/* TODO: duplicated with ram_init_bitmaps */
3066void colo_incoming_start_dirty_log(void)
3067{
3068 RAMBlock *block = NULL;
3069 /* For memory_global_dirty_log_start below. */
3070 qemu_mutex_lock_iothread();
3071 qemu_mutex_lock_ramlist();
3072
3073 memory_global_dirty_log_sync();
3074 WITH_RCU_READ_LOCK_GUARD() {
3075 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3076 ramblock_sync_dirty_bitmap(ram_state, block);
3077 /* Discard this dirty bitmap record */
3078 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3079 }
3080 memory_global_dirty_log_start();
3081 }
3082 ram_state->migration_dirty_pages = 0;
3083 qemu_mutex_unlock_ramlist();
3084 qemu_mutex_unlock_iothread();
3085}
3086
13af18f2
ZC
3087 /* The global lock needs to be held to call this helper */
3088void colo_release_ram_cache(void)
3089{
3090 RAMBlock *block;
3091
d1955d22 3092 memory_global_dirty_log_stop();
fbd162e6 3093 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3094 g_free(block->bmap);
3095 block->bmap = NULL;
3096 }
3097
89ac5a1d
DDAG
3098 WITH_RCU_READ_LOCK_GUARD() {
3099 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3100 if (block->colo_cache) {
3101 qemu_anon_ram_free(block->colo_cache, block->used_length);
3102 block->colo_cache = NULL;
3103 }
13af18f2
ZC
3104 }
3105 }
0393031a 3106 ram_state_cleanup(&ram_state);
13af18f2
ZC
3107}
3108
f265e0e4
JQ
3109/**
3110 * ram_load_setup: Setup RAM for migration incoming side
3111 *
3112 * Returns zero to indicate success and negative for error
3113 *
3114 * @f: QEMUFile where to receive the data
3115 * @opaque: RAMState pointer
3116 */
3117static int ram_load_setup(QEMUFile *f, void *opaque)
3118{
34ab9e97 3119 if (compress_threads_load_setup(f)) {
797ca154
XG
3120 return -1;
3121 }
3122
f265e0e4 3123 xbzrle_load_setup();
f9494614 3124 ramblock_recv_map_init();
13af18f2 3125
f265e0e4
JQ
3126 return 0;
3127}
3128
3129static int ram_load_cleanup(void *opaque)
3130{
f9494614 3131 RAMBlock *rb;
56eb90af 3132
fbd162e6 3133 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3134 qemu_ram_block_writeback(rb);
56eb90af
JH
3135 }
3136
f265e0e4 3137 xbzrle_load_cleanup();
f0afa331 3138 compress_threads_load_cleanup();
f9494614 3139
fbd162e6 3140 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3141 g_free(rb->receivedmap);
3142 rb->receivedmap = NULL;
3143 }
13af18f2 3144
f265e0e4
JQ
3145 return 0;
3146}
3147
3d0684b2
JQ
3148/**
3149 * ram_postcopy_incoming_init: allocate postcopy data structures
3150 *
3151 * Returns 0 for success and negative if there was one error
3152 *
3153 * @mis: current migration incoming state
3154 *
3155 * Allocate data structures etc needed by incoming migration with
3156 * postcopy-ram. postcopy-ram's similarly named
3157 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3158 */
3159int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3160{
c136180c 3161 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3162}
3163
3d0684b2
JQ
3164/**
3165 * ram_load_postcopy: load a page in postcopy case
3166 *
3167 * Returns 0 for success or -errno in case of error
3168 *
a7180877
DDAG
3169 * Called in postcopy mode by ram_load().
3170 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3171 *
3172 * @f: QEMUFile where to receive the data
a7180877
DDAG
3173 */
3174static int ram_load_postcopy(QEMUFile *f)
3175{
3176 int flags = 0, ret = 0;
3177 bool place_needed = false;
1aa83678 3178 bool matches_target_page_size = false;
a7180877
DDAG
3179 MigrationIncomingState *mis = migration_incoming_get_current();
3180 /* Temporary page that is later 'placed' */
3414322a 3181 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 3182 void *this_host = NULL;
ddf35bdf 3183 bool all_zero = true;
4cbb3c63 3184 int target_pages = 0;
a7180877
DDAG
3185
3186 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3187 ram_addr_t addr;
3188 void *host = NULL;
3189 void *page_buffer = NULL;
3190 void *place_source = NULL;
df9ff5e1 3191 RAMBlock *block = NULL;
a7180877 3192 uint8_t ch;
644acf99 3193 int len;
a7180877
DDAG
3194
3195 addr = qemu_get_be64(f);
7a9ddfbf
PX
3196
3197 /*
3198 * If qemu file error, we should stop here, and then "addr"
3199 * may be invalid
3200 */
3201 ret = qemu_file_get_error(f);
3202 if (ret) {
3203 break;
3204 }
3205
a7180877
DDAG
3206 flags = addr & ~TARGET_PAGE_MASK;
3207 addr &= TARGET_PAGE_MASK;
3208
3209 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
644acf99
WY
3210 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3211 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3212 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3213
3214 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3215 if (!host) {
3216 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3217 ret = -EINVAL;
3218 break;
3219 }
4cbb3c63 3220 target_pages++;
1aa83678 3221 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3222 /*
28abd200
DDAG
3223 * Postcopy requires that we place whole host pages atomically;
3224 * these may be huge pages for RAMBlocks that are backed by
3225 * hugetlbfs.
a7180877
DDAG
3226 * To make it atomic, the data is read into a temporary page
3227 * that's moved into place later.
3228 * The migration protocol uses, possibly smaller, target-pages
3229 * however the source ensures it always sends all the components
91ba442f 3230 * of a host page in one chunk.
a7180877
DDAG
3231 */
3232 page_buffer = postcopy_host_page +
28abd200 3233 ((uintptr_t)host & (block->page_size - 1));
e5e73b0f 3234 if (target_pages == 1) {
91ba442f
WY
3235 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3236 block->page_size);
c53b7ddc
DDAG
3237 } else {
3238 /* not the 1st TP within the HP */
91ba442f
WY
3239 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3240 (uintptr_t)this_host) {
3241 error_report("Non-same host page %p/%p",
3242 host, this_host);
c53b7ddc
DDAG
3243 ret = -EINVAL;
3244 break;
3245 }
a7180877
DDAG
3246 }
3247
3248 /*
3249 * If it's the last part of a host page then we place the host
3250 * page
3251 */
4cbb3c63
WY
3252 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3253 place_needed = true;
4cbb3c63 3254 }
a7180877
DDAG
3255 place_source = postcopy_host_page;
3256 }
3257
3258 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3259 case RAM_SAVE_FLAG_ZERO:
a7180877 3260 ch = qemu_get_byte(f);
2e36bc1b
WY
3261 /*
3262 * We can skip setting page_buffer when
3263 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3264 */
3265 if (ch || !matches_target_page_size) {
3266 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3267 }
a7180877
DDAG
3268 if (ch) {
3269 all_zero = false;
3270 }
3271 break;
3272
3273 case RAM_SAVE_FLAG_PAGE:
3274 all_zero = false;
1aa83678
PX
3275 if (!matches_target_page_size) {
3276 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3277 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3278 } else {
1aa83678
PX
3279 /*
3280 * For small pages that matches target page size, we
3281 * avoid the qemu_file copy. Instead we directly use
3282 * the buffer of QEMUFile to place the page. Note: we
3283 * cannot do any QEMUFile operation before using that
3284 * buffer to make sure the buffer is valid when
3285 * placing the page.
a7180877
DDAG
3286 */
3287 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3288 TARGET_PAGE_SIZE);
3289 }
3290 break;
644acf99
WY
3291 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3292 all_zero = false;
3293 len = qemu_get_be32(f);
3294 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3295 error_report("Invalid compressed data length: %d", len);
3296 ret = -EINVAL;
3297 break;
3298 }
3299 decompress_data_with_multi_threads(f, page_buffer, len);
3300 break;
3301
a7180877
DDAG
3302 case RAM_SAVE_FLAG_EOS:
3303 /* normal exit */
6df264ac 3304 multifd_recv_sync_main();
a7180877
DDAG
3305 break;
3306 default:
3307 error_report("Unknown combination of migration flags: %#x"
3308 " (postcopy mode)", flags);
3309 ret = -EINVAL;
7a9ddfbf
PX
3310 break;
3311 }
3312
644acf99
WY
3313 /* Got the whole host page, wait for decompress before placing. */
3314 if (place_needed) {
3315 ret |= wait_for_decompress_done();
3316 }
3317
7a9ddfbf
PX
3318 /* Detect for any possible file errors */
3319 if (!ret && qemu_file_get_error(f)) {
3320 ret = qemu_file_get_error(f);
a7180877
DDAG
3321 }
3322
7a9ddfbf 3323 if (!ret && place_needed) {
a7180877 3324 /* This gets called at the last target page in the host page */
91ba442f
WY
3325 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3326 block->page_size);
df9ff5e1 3327
a7180877 3328 if (all_zero) {
df9ff5e1 3329 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3330 block);
a7180877 3331 } else {
df9ff5e1 3332 ret = postcopy_place_page(mis, place_dest,
8be4620b 3333 place_source, block);
a7180877 3334 }
ddf35bdf
DH
3335 place_needed = false;
3336 target_pages = 0;
3337 /* Assume we have a zero page until we detect something different */
3338 all_zero = true;
a7180877 3339 }
a7180877
DDAG
3340 }
3341
3342 return ret;
3343}
3344
acab30b8
DHB
3345static bool postcopy_is_advised(void)
3346{
3347 PostcopyState ps = postcopy_state_get();
3348 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3349}
3350
3351static bool postcopy_is_running(void)
3352{
3353 PostcopyState ps = postcopy_state_get();
3354 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3355}
3356
e6f4aa18
ZC
3357/*
3358 * Flush content of RAM cache into SVM's memory.
3359 * Only flush the pages that were dirtied by PVM or SVM or both.
3360 */
24fa16f8 3361void colo_flush_ram_cache(void)
e6f4aa18
ZC
3362{
3363 RAMBlock *block = NULL;
3364 void *dst_host;
3365 void *src_host;
3366 unsigned long offset = 0;
3367
d1955d22 3368 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3369 WITH_RCU_READ_LOCK_GUARD() {
3370 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371 ramblock_sync_dirty_bitmap(ram_state, block);
3372 }
d1955d22 3373 }
d1955d22 3374
e6f4aa18 3375 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3376 WITH_RCU_READ_LOCK_GUARD() {
3377 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3378
89ac5a1d
DDAG
3379 while (block) {
3380 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 3381
8bba004c
AR
3382 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3383 >= block->used_length) {
89ac5a1d
DDAG
3384 offset = 0;
3385 block = QLIST_NEXT_RCU(block, next);
3386 } else {
3387 migration_bitmap_clear_dirty(ram_state, block, offset);
8bba004c
AR
3388 dst_host = block->host
3389 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3390 src_host = block->colo_cache
3391 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
89ac5a1d
DDAG
3392 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3393 }
e6f4aa18
ZC
3394 }
3395 }
e6f4aa18
ZC
3396 trace_colo_flush_ram_cache_end();
3397}
3398
10da4a36
WY
3399/**
3400 * ram_load_precopy: load pages in precopy case
3401 *
3402 * Returns 0 for success or -errno in case of error
3403 *
3404 * Called in precopy mode by ram_load().
3405 * rcu_read_lock is taken prior to this being called.
3406 *
3407 * @f: QEMUFile where to receive the data
3408 */
3409static int ram_load_precopy(QEMUFile *f)
56e93d26 3410{
e65cec5e 3411 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3412 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3413 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3414 if (!migrate_use_compression()) {
3415 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3416 }
a7180877 3417
10da4a36 3418 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3419 ram_addr_t addr, total_ram_bytes;
0393031a 3420 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3421 uint8_t ch;
3422
e65cec5e
YK
3423 /*
3424 * Yield periodically to let the main loop run, but an iteration of
3425 * the main loop is expensive, so only yield once every so many iterations.
3426 */
3427 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3428 aio_co_schedule(qemu_get_current_aio_context(),
3429 qemu_coroutine_self());
3430 qemu_coroutine_yield();
3431 }
3432 i++;
3433
56e93d26
JQ
3434 addr = qemu_get_be64(f);
3435 flags = addr & ~TARGET_PAGE_MASK;
3436 addr &= TARGET_PAGE_MASK;
3437
edc60127
JQ
3438 if (flags & invalid_flags) {
3439 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3440 error_report("Received an unexpected compressed page");
3441 }
3442
3443 ret = -EINVAL;
3444 break;
3445 }
3446
bb890ed5 3447 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3448 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3449 RAMBlock *block = ram_block_from_stream(f, flags);
3450
0393031a 3451 host = host_from_ram_block_offset(block, addr);
13af18f2 3452 /*
0393031a
HZ
3453 * After entering the COLO stage, we should not load pages
3454 * into the SVM's memory directly; we put them into colo_cache first.
3455 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3456 * Previously, we copied all of this memory in the COLO preparation
3457 * stage, during which the VM had to be stopped, which is time-consuming.
3458 * Here we optimize it by backing up every page during the migration
3459 * process while COLO is enabled. Although this slows down the
3460 * migration, it clearly reduces the downtime that backing up all of
3461 * the SVM's memory in the COLO preparation stage would cause.
13af18f2 3462 */
0393031a
HZ
3463 if (migration_incoming_colo_enabled()) {
3464 if (migration_incoming_in_colo_state()) {
3465 /* In COLO stage, put all pages into cache temporarily */
8af66371 3466 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3467 } else {
3468 /*
3469 * In the migration stage but before the COLO stage,
3470 * put all pages into both the cache and the SVM's memory.
3471 */
8af66371 3472 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3473 }
13af18f2 3474 }
a776aa15
DDAG
3475 if (!host) {
3476 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3477 ret = -EINVAL;
3478 break;
3479 }
13af18f2
ZC
3480 if (!migration_incoming_in_colo_state()) {
3481 ramblock_recv_bitmap_set(block, host);
3482 }
3483
1db9d8e5 3484 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3485 }
3486
56e93d26
JQ
3487 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3488 case RAM_SAVE_FLAG_MEM_SIZE:
3489 /* Synchronize RAM block list */
3490 total_ram_bytes = addr;
3491 while (!ret && total_ram_bytes) {
3492 RAMBlock *block;
56e93d26
JQ
3493 char id[256];
3494 ram_addr_t length;
3495
3496 len = qemu_get_byte(f);
3497 qemu_get_buffer(f, (uint8_t *)id, len);
3498 id[len] = 0;
3499 length = qemu_get_be64(f);
3500
e3dd7493 3501 block = qemu_ram_block_by_name(id);
b895de50
CLG
3502 if (block && !qemu_ram_is_migratable(block)) {
3503 error_report("block %s should not be migrated !", id);
3504 ret = -EINVAL;
3505 } else if (block) {
e3dd7493
DDAG
3506 if (length != block->used_length) {
3507 Error *local_err = NULL;
56e93d26 3508
fa53a0e5 3509 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3510 &local_err);
3511 if (local_err) {
3512 error_report_err(local_err);
56e93d26 3513 }
56e93d26 3514 }
ef08fb38
DDAG
3515 /* For postcopy we need to check that hugepage sizes match */
3516 if (postcopy_advised &&
3517 block->page_size != qemu_host_page_size) {
3518 uint64_t remote_page_size = qemu_get_be64(f);
3519 if (remote_page_size != block->page_size) {
3520 error_report("Mismatched RAM page size %s "
3521 "(local) %zd != %" PRId64,
3522 id, block->page_size,
3523 remote_page_size);
3524 ret = -EINVAL;
3525 }
3526 }
fbd162e6
YK
3527 if (migrate_ignore_shared()) {
3528 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
3529 if (ramblock_is_ignored(block) &&
3530 block->mr->addr != addr) {
3531 error_report("Mismatched GPAs for block %s "
3532 "%" PRId64 "!= %" PRId64,
3533 id, (uint64_t)addr,
3534 (uint64_t)block->mr->addr);
3535 ret = -EINVAL;
3536 }
3537 }
e3dd7493
DDAG
3538 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3539 block->idstr);
3540 } else {
56e93d26
JQ
3541 error_report("Unknown ramblock \"%s\", cannot "
3542 "accept migration", id);
3543 ret = -EINVAL;
3544 }
3545
3546 total_ram_bytes -= length;
3547 }
3548 break;
a776aa15 3549
bb890ed5 3550 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
3551 ch = qemu_get_byte(f);
3552 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3553 break;
a776aa15 3554
56e93d26 3555 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
3556 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3557 break;
56e93d26 3558
a776aa15 3559 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
3560 len = qemu_get_be32(f);
3561 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3562 error_report("Invalid compressed data length: %d", len);
3563 ret = -EINVAL;
3564 break;
3565 }
c1bc6626 3566 decompress_data_with_multi_threads(f, host, len);
56e93d26 3567 break;
a776aa15 3568
56e93d26 3569 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
3570 if (load_xbzrle(f, addr, host) < 0) {
3571 error_report("Failed to decompress XBZRLE page at "
3572 RAM_ADDR_FMT, addr);
3573 ret = -EINVAL;
3574 break;
3575 }
3576 break;
3577 case RAM_SAVE_FLAG_EOS:
3578 /* normal exit */
6df264ac 3579 multifd_recv_sync_main();
56e93d26
JQ
3580 break;
3581 default:
3582 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 3583 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
3584 } else {
3585 error_report("Unknown combination of migration flags: %#x",
3586 flags);
3587 ret = -EINVAL;
3588 }
3589 }
3590 if (!ret) {
3591 ret = qemu_file_get_error(f);
3592 }
0393031a
HZ
3593 if (!ret && host_bak) {
3594 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3595 }
56e93d26
JQ
3596 }
3597
ca1a6b70 3598 ret |= wait_for_decompress_done();
10da4a36
WY
3599 return ret;
3600}
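
Each record handled by the loop above begins with one big-endian 64-bit word that carries both the page address and the RAM_SAVE_FLAG_* bits: the address is page-aligned, so the flags live in the bits below the page size and the two are split with TARGET_PAGE_MASK. A minimal sketch of that packing and unpacking, using local example constants rather than QEMU's:

#include <inttypes.h>
#include <stdio.h>

#define EX_PAGE_BITS 12                              /* 4 KiB example pages */
#define EX_PAGE_MASK (~((1ULL << EX_PAGE_BITS) - 1))
#define EX_FLAG_ZERO 0x02
#define EX_FLAG_PAGE 0x08
#define EX_FLAG_CONT 0x20

int main(void)
{
    uint64_t addr = 0x12345000;                      /* page-aligned offset */
    uint64_t word = addr | EX_FLAG_PAGE | EX_FLAG_CONT;  /* sender side */

    uint64_t flags = word & ~EX_PAGE_MASK;           /* receiver: low bits  */
    uint64_t page  = word & EX_PAGE_MASK;            /* receiver: address   */

    printf("addr=0x%" PRIx64 " flags=0x%" PRIx64 "\n", page, flags);
    return 0;
}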
3601
3602static int ram_load(QEMUFile *f, void *opaque, int version_id)
3603{
3604 int ret = 0;
3605 static uint64_t seq_iter;
3606 /*
3607 * If the system is running in postcopy mode, page inserts into host memory
3608 * must be atomic.
3609 */
3610 bool postcopy_running = postcopy_is_running();
3611
3612 seq_iter++;
3613
3614 if (version_id != 4) {
3615 return -EINVAL;
3616 }
3617
3618 /*
3619 * This RCU critical section can be very long-running.
3620 * When RCU reclamations in the code become numerous,
3621 * it will be necessary to reduce the granularity of this
3622 * critical section.
3623 */
89ac5a1d
DDAG
3624 WITH_RCU_READ_LOCK_GUARD() {
3625 if (postcopy_running) {
3626 ret = ram_load_postcopy(f);
3627 } else {
3628 ret = ram_load_precopy(f);
3629 }
10da4a36 3630 }
55c4446b 3631 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 3632
56e93d26
JQ
3633 return ret;
3634}
3635
c6467627
VSO
3636static bool ram_has_postcopy(void *opaque)
3637{
469dd51b 3638 RAMBlock *rb;
fbd162e6 3639 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
3640 if (ramblock_is_pmem(rb)) {
3641 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3642 "is not supported now!", rb->idstr, rb->host);
3643 return false;
3644 }
3645 }
3646
c6467627
VSO
3647 return migrate_postcopy_ram();
3648}
3649
edd090c7
PX
3650/* Sync all the dirty bitmaps with the destination VM. */
3651static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3652{
3653 RAMBlock *block;
3654 QEMUFile *file = s->to_dst_file;
3655 int ramblock_count = 0;
3656
3657 trace_ram_dirty_bitmap_sync_start();
3658
fbd162e6 3659 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
3660 qemu_savevm_send_recv_bitmap(file, block->idstr);
3661 trace_ram_dirty_bitmap_request(block->idstr);
3662 ramblock_count++;
3663 }
3664
3665 trace_ram_dirty_bitmap_sync_wait();
3666
3667 /* Wait until all the ramblocks' dirty bitmaps have been synced */
3668 while (ramblock_count--) {
3669 qemu_sem_wait(&s->rp_state.rp_sem);
3670 }
3671
3672 trace_ram_dirty_bitmap_sync_complete();
3673
3674 return 0;
3675}
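
The wait loop above is a count-down rendezvous: one recv-bitmap request goes out per ramblock, and the thread then waits on a semaphore that is posted once per completed bitmap (see ram_dirty_bitmap_reload_notify() below). A standalone POSIX sketch of the same pattern; the thread layout and NUM_BLOCKS are assumptions for illustration, and QEMU itself uses its qemu_sem_* wrappers rather than <semaphore.h>:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NUM_BLOCKS 4

static sem_t done_sem;

/* Stand-in for the return-path side: acknowledge one block and post. */
static void *worker(void *arg)
{
    (void)arg;
    /* ... receive and apply one bitmap here ... */
    sem_post(&done_sem);
    return NULL;
}

int main(void)
{
    pthread_t threads[NUM_BLOCKS];

    sem_init(&done_sem, 0, 0);
    for (int i = 0; i < NUM_BLOCKS; i++) {
        pthread_create(&threads[i], NULL, worker, NULL);
    }

    /* Wait until every block has been acknowledged, one post per block. */
    for (int i = 0; i < NUM_BLOCKS; i++) {
        sem_wait(&done_sem);
    }
    printf("all %d bitmaps synced\n", NUM_BLOCKS);

    for (int i = 0; i < NUM_BLOCKS; i++) {
        pthread_join(threads[i], NULL);
    }
    sem_destroy(&done_sem);
    return 0;
}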
3676
3677static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3678{
3679 qemu_sem_post(&s->rp_state.rp_sem);
3680}
3681
a335debb
PX
3682/*
3683 * Read the received bitmap and invert it to become the initial dirty bitmap.
3684 * This is only used when a paused postcopy migration wants
3685 * to resume from an intermediate point.
3686 */
3687int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3688{
3689 int ret = -EINVAL;
3690 QEMUFile *file = s->rp_state.from_dst_file;
3691 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 3692 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
3693 uint64_t size, end_mark;
3694
3695 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3696
3697 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3698 error_report("%s: incorrect state %s", __func__,
3699 MigrationStatus_str(s->state));
3700 return -EINVAL;
3701 }
3702
3703 /*
3704 * Note: see comments in ramblock_recv_bitmap_send() on why we
3705 * need the endianness conversion and the padding.
3706 */
3707 local_size = ROUND_UP(local_size, 8);
3708
3709 /* Add padding */
3710 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3711
3712 size = qemu_get_be64(file);
3713
3714 /* The size of the bitmap should match our ramblock */
3715 if (size != local_size) {
3716 error_report("%s: ramblock '%s' bitmap size mismatch "
3717 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3718 block->idstr, size, local_size);
3719 ret = -EINVAL;
3720 goto out;
3721 }
3722
3723 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3724 end_mark = qemu_get_be64(file);
3725
3726 ret = qemu_file_get_error(file);
3727 if (ret || size != local_size) {
3728 error_report("%s: read bitmap failed for ramblock '%s': %d"
3729 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3730 __func__, block->idstr, ret, local_size, size);
3731 ret = -EIO;
3732 goto out;
3733 }
3734
3735 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3736 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3737 __func__, block->idstr, end_mark);
3738 ret = -EINVAL;
3739 goto out;
3740 }
3741
3742 /*
3743 * Endianness conversion. We are in postcopy (though paused).
3744 * The dirty bitmap won't change. We can directly modify it.
3745 */
3746 bitmap_from_le(block->bmap, le_bitmap, nbits);
3747
3748 /*
3749 * What we received is the "received bitmap". Invert it to become the
3750 * initial dirty bitmap for this ramblock.
3751 */
3752 bitmap_complement(block->bmap, block->bmap, nbits);
3753
3754 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3755
edd090c7
PX
3756 /*
3757 * We succeeded in syncing the bitmap for the current ramblock. If this is
3758 * the last one to sync, we need to notify the main send thread.
3759 */
3760 ram_dirty_bitmap_reload_notify(s);
3761
a335debb
PX
3762 ret = 0;
3763out:
bf269906 3764 g_free(le_bitmap);
a335debb
PX
3765 return ret;
3766}
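
The size check and the final bitmap_complement() above follow from the wire format: the bitmap is transferred as little-endian 64-bit words, so its byte size is the page count rounded up to whole bytes and then to whole words, and the "received" bitmap has to be inverted because a page that was already received does not need to be sent again after resume. A small sketch of that arithmetic and the inversion, with illustrative names (spare bits past nbits in the last byte are simply ignored here):

#include <inttypes.h>
#include <stdio.h>

/* Bytes on the wire for a bitmap covering 'nbits' pages: round up to
 * whole bytes first, then to whole 64-bit words. */
static uint64_t wire_bitmap_size(uint64_t nbits)
{
    uint64_t bytes = (nbits + 7) / 8;       /* DIV_ROUND_UP(nbits, 8) */
    return (bytes + 7) & ~7ULL;             /* ROUND_UP(bytes, 8)     */
}

/* Received bitmap -> initial dirty bitmap: every page NOT yet received is
 * still dirty and must be resent after the migration resumes. */
static void received_to_dirty(unsigned char *bmap, uint64_t nbits)
{
    for (uint64_t i = 0; i < (nbits + 7) / 8; i++) {
        bmap[i] = (unsigned char)~bmap[i];
    }
}

int main(void)
{
    unsigned char bmap[128] = { 0 };
    uint64_t nbits = 1000;                  /* e.g. a 1000-page ramblock */

    printf("wire size = %" PRIu64 " bytes\n", wire_bitmap_size(nbits));
    received_to_dirty(bmap, nbits);
    return 0;
}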
3767
edd090c7
PX
3768static int ram_resume_prepare(MigrationState *s, void *opaque)
3769{
3770 RAMState *rs = *(RAMState **)opaque;
08614f34 3771 int ret;
edd090c7 3772
08614f34
PX
3773 ret = ram_dirty_bitmap_sync_all(s, rs);
3774 if (ret) {
3775 return ret;
3776 }
3777
3778 ram_state_resume_prepare(rs, s->to_dst_file);
3779
3780 return 0;
edd090c7
PX
3781}
3782
56e93d26 3783static SaveVMHandlers savevm_ram_handlers = {
9907e842 3784 .save_setup = ram_save_setup,
56e93d26 3785 .save_live_iterate = ram_save_iterate,
763c906b 3786 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 3787 .save_live_complete_precopy = ram_save_complete,
c6467627 3788 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
3789 .save_live_pending = ram_save_pending,
3790 .load_state = ram_load,
f265e0e4
JQ
3791 .save_cleanup = ram_save_cleanup,
3792 .load_setup = ram_load_setup,
3793 .load_cleanup = ram_load_cleanup,
edd090c7 3794 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
3795};
3796
3797void ram_mig_init(void)
3798{
3799 qemu_mutex_init(&XBZRLE.lock);
ce62df53 3800 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 3801}