/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS. It
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
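
/*
 * Worked example (assuming a 4 KiB target page): save_page_header()
 * ORs these flags into the low bits of the page offset and sends the
 * result as a single be64, which is safe because offsets are
 * target-page aligned.  The third page of a block sent as a zero page
 * therefore goes out as 0x2000 | RAM_SAVE_FLAG_ZERO == 0x2002.
 */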

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

/* Should be holding either ram_list.mutex, or the RCU lock. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that source and destination VMs do not need to use
     * the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
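
/*
 * Worked example: for a 1 GiB block with 4 KiB target pages the bitmap
 * holds 262144 bits, so the stream carries an 8-byte size (32768),
 * 32768 bytes of little-endian bitmap, then the 8-byte
 * RAMBLOCK_RECV_BITMAP_ENDING marker; the return value counts only
 * size + bitmap, i.e. 32776 bytes.
 */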

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
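
/*
 * Worked example: a page from the block that was sent last time costs
 * just the 8-byte offset|flags word (RAM_SAVE_FLAG_CONTINUE is set);
 * switching to a block named "pc.ram" costs 8 + 1 + 6 = 15 bytes, as
 * the idstr is sent once per block change.
 */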

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
    int pct_max = s->parameters.max_cpu_throttle;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
                             pct_max));
    }
}
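
/*
 * Worked example (with what should be the default parameters: initial
 * 20%, increment 10%, max 99%): repeated calls throttle the vCPUs
 * 20% -> 30% -> 40% ... capped at the maximum, one step per sync
 * period that still dirties memory too fast.
 */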

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long. We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}
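
/*
 * Worked example (assuming 4 KiB target pages and a clear_bmap_shift
 * of 18, which should be the current default): one clear_bmap bit
 * covers 1 << (12 + 18) bytes, so the remote dirty log is cleared in
 * aligned 1 GiB chunks around the page that is about to be sent.
 */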

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
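
/*
 * Worked example: if a period handled 10000 target pages of which 2500
 * were xbzrle cache misses, cache_miss_rate is 0.25; 1000 compressed
 * 4 KiB pages that shrank to 1 MiB on the wire give a compression_rate
 * of roughly 3.9.
 */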

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
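
/*
 * Worked example of the auto-converge trigger above: if ~300 MB were
 * dirtied in a period while only ~400 MB were transferred, then
 * 300 MB > 400 MB / 2 holds and dirty_rate_high_cnt is bumped; a
 * second such period in a row calls mig_throttle_guest_down().
 */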

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}

/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}

static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it's really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. In case there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}
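
/*
 * Worked example: a destination fault on a 2 MiB hugepage typically
 * arrives here as a single request with len == 2 MiB; unqueue_page()
 * then consumes the entry TARGET_PAGE_SIZE at a time, i.e. over 512
 * calls with 4 KiB target pages.
 */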

static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is on, stop using the data compression after first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    compression_counters.busy++;
    return false;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}
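
/*
 * Note on the ordering above: the control path (e.g. RDMA) gets the
 * first chance at each target page, then the compression threads, then
 * the zero-page check, then multifd, and finally the plain/xbzrle path
 * in ram_save_page(); the first path that handles the page wins.
 */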
1642/**
3d0684b2 1643 * ram_save_host_page: save a whole host page
a82d593b 1644 *
3d0684b2
JQ
1645 * Starting at *offset send pages up to the end of the current host
1646 * page. It's valid for the initial offset to point into the middle of
1647 * a host page in which case the remainder of the hostpage is sent.
1648 * Only dirty target pages are sent. Note that the host page size may
1649 * be a huge page for this block.
1eb3fc0a
DDAG
1650 * The saving stops at the boundary of the used_length of the block
1651 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1652 *
3d0684b2
JQ
1653 * Returns the number of pages written or negative on error
1654 *
6f37bb8b 1655 * @rs: current RAM state
3d0684b2 1656 * @ms: current migration state
3d0684b2 1657 * @pss: data about the page we want to send
a82d593b 1658 * @last_stage: if we are at the completion stage
a82d593b 1659 */
a0a8aa14 1660static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1661 bool last_stage)
a82d593b
DDAG
1662{
1663 int tmppages, pages = 0;
a935e30f
JQ
1664 size_t pagesize_bits =
1665 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1666
fbd162e6 1667 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
1668 error_report("block %s should not be migrated !", pss->block->idstr);
1669 return 0;
1670 }
1671
a82d593b 1672 do {
1faa5665
XG
1673 /* Check if the page is dirty and, if it is, send it */
1674 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1675 pss->page++;
1676 continue;
1677 }
1678
f20e2865 1679 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1680 if (tmppages < 0) {
1681 return tmppages;
1682 }
1683
1684 pages += tmppages;
a935e30f 1685 pss->page++;
97e1e067
DDAG
1686 /* Allow rate limiting to happen in the middle of huge pages */
1687 migration_rate_limit();
1eb3fc0a 1688 } while ((pss->page & (pagesize_bits - 1)) &&
8bba004c
AR
1689 offset_in_ramblock(pss->block,
1690 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
a82d593b
DDAG
1691
1692 /* The offset we leave with is the last one we looked at */
a935e30f 1693 pss->page--;
a82d593b
DDAG
1694 return pages;
1695}
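/*
 * A minimal standalone sketch of the host-page arithmetic the loop above
 * relies on.  Despite its name, pagesize_bits above is the number of target
 * pages per host page; the loop stops as soon as pss->page reaches a
 * multiple of that count, i.e. the first target page of the next host page.
 * The example_ names and the 2 MiB / 4 KiB sizes are illustrative only.
 */
static unsigned long example_pages_per_host_page(void)
{
    unsigned long host_page_size = 2 * 1024 * 1024;   /* hypothetical hugepage */
    unsigned long target_page_size = 4 * 1024;

    return host_page_size / target_page_size;          /* 512 */
}

static bool example_crossed_host_page(unsigned long page,
                                      unsigned long pages_per_host_page)
{
    /* true once 'page' is aligned to a host-page boundary (power of two) */
    return (page & (pages_per_host_page - 1)) == 0;
}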
6c595cde 1696
56e93d26 1697/**
3d0684b2 1698 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1699 *
1700 * Called within an RCU critical section.
1701 *
e8f3735f
XG
1702 * Returns the number of pages written where zero means no dirty pages,
1703 * or negative on error
56e93d26 1704 *
6f37bb8b 1705 * @rs: current RAM state
56e93d26 1706 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1707 *
1708 * On systems where host-page-size > target-page-size it will send all the
1709 * pages in a host page that are dirty.
56e93d26
JQ
1710 */
1711
ce25d337 1712static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1713{
b8fb8cb7 1714 PageSearchStatus pss;
56e93d26 1715 int pages = 0;
b9e60928 1716 bool again, found;
56e93d26 1717
0827b9e9
AA
1718 /* No dirty page as there is zero RAM */
1719 if (!ram_bytes_total()) {
1720 return pages;
1721 }
1722
6f37bb8b 1723 pss.block = rs->last_seen_block;
a935e30f 1724 pss.page = rs->last_page;
b8fb8cb7
DDAG
1725 pss.complete_round = false;
1726
1727 if (!pss.block) {
1728 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1729 }
56e93d26 1730
b9e60928 1731 do {
a82d593b 1732 again = true;
f20e2865 1733 found = get_queued_page(rs, &pss);
b9e60928 1734
a82d593b
DDAG
1735 if (!found) {
1736 /* priority queue empty, so just search for something dirty */
f20e2865 1737 found = find_dirty_block(rs, &pss, &again);
a82d593b 1738 }
f3f491fc 1739
a82d593b 1740 if (found) {
f20e2865 1741 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1742 }
b9e60928 1743 } while (!pages && again);
56e93d26 1744
6f37bb8b 1745 rs->last_seen_block = pss.block;
a935e30f 1746 rs->last_page = pss.page;
56e93d26
JQ
1747
1748 return pages;
1749}
1750
1751void acct_update_position(QEMUFile *f, size_t size, bool zero)
1752{
1753 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1754
56e93d26 1755 if (zero) {
9360447d 1756 ram_counters.duplicate += pages;
56e93d26 1757 } else {
9360447d
JQ
1758 ram_counters.normal += pages;
1759 ram_counters.transferred += size;
56e93d26
JQ
1760 qemu_update_position(f, size);
1761 }
1762}
1763
fbd162e6 1764static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
1765{
1766 RAMBlock *block;
1767 uint64_t total = 0;
1768
89ac5a1d
DDAG
1769 RCU_READ_LOCK_GUARD();
1770
fbd162e6
YK
1771 if (count_ignored) {
1772 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1773 total += block->used_length;
1774 }
1775 } else {
1776 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1777 total += block->used_length;
1778 }
99e15582 1779 }
56e93d26
JQ
1780 return total;
1781}
1782
fbd162e6
YK
1783uint64_t ram_bytes_total(void)
1784{
1785 return ram_bytes_total_common(false);
1786}
1787
f265e0e4 1788static void xbzrle_load_setup(void)
56e93d26 1789{
f265e0e4 1790 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
1791}
1792
f265e0e4
JQ
1793static void xbzrle_load_cleanup(void)
1794{
1795 g_free(XBZRLE.decoded_buf);
1796 XBZRLE.decoded_buf = NULL;
1797}
1798
7d7c96be
PX
1799static void ram_state_cleanup(RAMState **rsp)
1800{
b9ccaf6d
DDAG
1801 if (*rsp) {
1802 migration_page_queue_free(*rsp);
1803 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1804 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1805 g_free(*rsp);
1806 *rsp = NULL;
1807 }
7d7c96be
PX
1808}
1809
84593a08
PX
1810static void xbzrle_cleanup(void)
1811{
1812 XBZRLE_cache_lock();
1813 if (XBZRLE.cache) {
1814 cache_fini(XBZRLE.cache);
1815 g_free(XBZRLE.encoded_buf);
1816 g_free(XBZRLE.current_buf);
1817 g_free(XBZRLE.zero_target_page);
1818 XBZRLE.cache = NULL;
1819 XBZRLE.encoded_buf = NULL;
1820 XBZRLE.current_buf = NULL;
1821 XBZRLE.zero_target_page = NULL;
1822 }
1823 XBZRLE_cache_unlock();
1824}
1825
f265e0e4 1826static void ram_save_cleanup(void *opaque)
56e93d26 1827{
53518d94 1828 RAMState **rsp = opaque;
6b6712ef 1829 RAMBlock *block;
eb859c53 1830
2ff64038 1831 /* The caller must hold the iothread lock or be in a bh, so there is
4633456c 1832 * no write race against the migration bitmap
2ff64038 1833 */
6b6712ef
JQ
1834 memory_global_dirty_log_stop();
1835
fbd162e6 1836 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
1837 g_free(block->clear_bmap);
1838 block->clear_bmap = NULL;
6b6712ef
JQ
1839 g_free(block->bmap);
1840 block->bmap = NULL;
56e93d26
JQ
1841 }
1842
84593a08 1843 xbzrle_cleanup();
f0afa331 1844 compress_threads_save_cleanup();
7d7c96be 1845 ram_state_cleanup(rsp);
56e93d26
JQ
1846}
1847
6f37bb8b 1848static void ram_state_reset(RAMState *rs)
56e93d26 1849{
6f37bb8b
JQ
1850 rs->last_seen_block = NULL;
1851 rs->last_sent_block = NULL;
269ace29 1852 rs->last_page = 0;
6f37bb8b
JQ
1853 rs->last_version = ram_list.version;
1854 rs->ram_bulk_stage = true;
6eeb63f7 1855 rs->fpo_enabled = false;
56e93d26
JQ
1856}
1857
1858#define MAX_WAIT 50 /* ms, half buffered_file limit */
1859
4f2e4252
DDAG
1860/*
1861 * 'expected' is the value you expect the bitmap mostly to be full
1862 * of; it won't bother printing lines that are all this value.
1863 * If 'todump' is null the migration bitmap is dumped.
1864 */
6b6712ef
JQ
1865void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1866 unsigned long pages)
4f2e4252 1867{
4f2e4252
DDAG
1868 int64_t cur;
1869 int64_t linelen = 128;
1870 char linebuf[129];
1871
6b6712ef 1872 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1873 int64_t curb;
1874 bool found = false;
1875 /*
1876 * Last line; catch the case where the line length
1877 * is longer than remaining ram
1878 */
6b6712ef
JQ
1879 if (cur + linelen > pages) {
1880 linelen = pages - cur;
4f2e4252
DDAG
1881 }
1882 for (curb = 0; curb < linelen; curb++) {
1883 bool thisbit = test_bit(cur + curb, todump);
1884 linebuf[curb] = thisbit ? '1' : '.';
1885 found = found || (thisbit != expected);
1886 }
1887 if (found) {
1888 linebuf[curb] = '\0';
1889 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1890 }
1891 }
1892}
1893
e0b266f0
DDAG
1894/* **** functions for postcopy ***** */
1895
ced1c616
PB
1896void ram_postcopy_migrated_memory_release(MigrationState *ms)
1897{
1898 struct RAMBlock *block;
ced1c616 1899
fbd162e6 1900 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
1901 unsigned long *bitmap = block->bmap;
1902 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1903 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1904
1905 while (run_start < range) {
1906 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
1907 ram_discard_range(block->idstr,
1908 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1909 ((ram_addr_t)(run_end - run_start))
1910 << TARGET_PAGE_BITS);
ced1c616
PB
1911 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1912 }
1913 }
1914}
1915
3d0684b2
JQ
1916/**
1917 * postcopy_send_discard_bm_ram: discard a RAMBlock
1918 *
1919 * Returns zero on success
1920 *
e0b266f0 1921 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
1922 *
1923 * @ms: current migration state
89dab31b 1924 * @block: RAMBlock to discard
e0b266f0 1925 */
810cf2bb 1926static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 1927{
6b6712ef 1928 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1929 unsigned long current;
1e7cf8c3 1930 unsigned long *bitmap = block->bmap;
e0b266f0 1931
6b6712ef 1932 for (current = 0; current < end; ) {
1e7cf8c3 1933 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 1934 unsigned long zero, discard_length;
e0b266f0 1935
33a5cb62
WY
1936 if (one >= end) {
1937 break;
1938 }
e0b266f0 1939
1e7cf8c3 1940 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
1941
1942 if (zero >= end) {
1943 discard_length = end - one;
e0b266f0 1944 } else {
33a5cb62
WY
1945 discard_length = zero - one;
1946 }
810cf2bb 1947 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 1948 current = one + discard_length;
e0b266f0
DDAG
1949 }
1950
1951 return 0;
1952}
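/*
 * A minimal sketch of the run-length walk performed above, written against
 * a plain byte-per-page array instead of QEMU's find_next_bit() /
 * find_next_zero_bit() bitmap helpers.  example_send_runs() and the emit
 * callback are hypothetical names used only for illustration.
 */
static void example_send_runs(const unsigned char *dirty, unsigned long npages,
                              void (*emit)(unsigned long start,
                                           unsigned long length))
{
    unsigned long cur = 0;

    while (cur < npages) {
        unsigned long one, zero;

        /* first dirty page at or after cur */
        for (one = cur; one < npages && !dirty[one]; one++) {
            /* skip clean pages */
        }
        if (one >= npages) {
            break;
        }
        /* end of this dirty run */
        for (zero = one + 1; zero < npages && dirty[zero]; zero++) {
            /* skip dirty pages */
        }
        emit(one, zero - one);    /* one (start, length) discard range */
        cur = zero;
    }
}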
1953
3d0684b2
JQ
1954/**
1955 * postcopy_each_ram_send_discard: discard all RAMBlocks
1956 *
1957 * Returns 0 for success or negative for error
1958 *
e0b266f0
DDAG
1959 * Utility for the outgoing postcopy code.
1960 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1961 * passing it bitmap indexes and name.
e0b266f0
DDAG
1962 * (qemu_ram_foreach_block ends up passing unscaled lengths
1963 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1964 *
1965 * @ms: current migration state
e0b266f0
DDAG
1966 */
1967static int postcopy_each_ram_send_discard(MigrationState *ms)
1968{
1969 struct RAMBlock *block;
1970 int ret;
1971
fbd162e6 1972 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 1973 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
1974
1975 /*
1976 * Postcopy sends chunks of bitmap over the wire, but it
1977 * just needs indexes at this point, avoids it having
1978 * target page specific code.
1979 */
810cf2bb
WY
1980 ret = postcopy_send_discard_bm_ram(ms, block);
1981 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
1982 if (ret) {
1983 return ret;
1984 }
1985 }
1986
1987 return 0;
1988}
1989
3d0684b2 1990/**
8324ef86 1991 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
1992 *
1993 * Helper for postcopy_chunk_hostpages; called once per RAMBlock to
1994 * canonicalize the dirty bitmap so that each host page is either
1995 * entirely dirty or entirely clean.
99e314eb 1996 *
3d0684b2
JQ
1997 * Postcopy requires that all target pages in a hostpage are dirty or
1998 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1999 *
3d0684b2 2000 * @ms: current migration state
3d0684b2 2001 * @block: block that contains the page we want to canonicalize
99e314eb 2002 */
1e7cf8c3 2003static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2004{
53518d94 2005 RAMState *rs = ram_state;
6b6712ef 2006 unsigned long *bitmap = block->bmap;
29c59172 2007 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2008 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2009 unsigned long run_start;
2010
29c59172
DDAG
2011 if (block->page_size == TARGET_PAGE_SIZE) {
2012 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2013 return;
2014 }
2015
1e7cf8c3
WY
2016 /* Find a dirty page */
2017 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2018
6b6712ef 2019 while (run_start < pages) {
99e314eb
DDAG
2020
2021 /*
2022 * If the start of this run of pages is in the middle of a host
2023 * page, then we need to fixup this host page.
2024 */
9dec3cc3 2025 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2026 /* Find the end of this run */
1e7cf8c3 2027 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2028 /*
2029 * If the end isn't at the start of a host page, then the
2030 * run doesn't finish at the end of a host page
2031 * and we need to discard.
2032 */
99e314eb
DDAG
2033 }
2034
9dec3cc3 2035 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2036 unsigned long page;
dad45ab2
WY
2037 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2038 host_ratio);
2039 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2040
99e314eb
DDAG
2041 /* Clean up the bitmap */
2042 for (page = fixup_start_addr;
2043 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2044 /*
2045 * Remark them as dirty, updating the count for any pages
2046 * that weren't previously dirty.
2047 */
0d8ec885 2048 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2049 }
2050 }
2051
1e7cf8c3
WY
2052 /* Find the next dirty page for the next iteration */
2053 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2054 }
2055}
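/*
 * A small worked sketch of the fix-up arithmetic above, using plain integer
 * maths instead of QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP().  With a hypothetical
 * host_ratio of 512 target pages per host page, a dirty run starting at
 * target page 700 causes the whole host page [512, 1024) to be re-marked
 * dirty, so no host page is ever left half dirty.
 */
static void example_fixup_range(unsigned long run_start,
                                unsigned long host_ratio,
                                unsigned long *fixup_start,
                                unsigned long *fixup_end)
{
    *fixup_start = run_start - (run_start % host_ratio);   /* 700 -> 512 */
    *fixup_end = *fixup_start + host_ratio;                /* 512 + 512 = 1024 */
}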
2056
3d0684b2 2057/**
89dab31b 2058 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 2059 *
99e314eb
DDAG
2060 * Utility for the outgoing postcopy code.
2061 *
2062 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2063 * dirty host-page size chunks as all dirty. In this case the host-page
2064 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2065 *
3d0684b2
JQ
2066 * Returns zero on success
2067 *
2068 * @ms: current migration state
6b6712ef 2069 * @block: block we want to work with
99e314eb 2070 */
6b6712ef 2071static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2072{
810cf2bb 2073 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2074
6b6712ef 2075 /*
1e7cf8c3 2076 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 2077 */
1e7cf8c3 2078 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 2079
810cf2bb 2080 postcopy_discard_send_finish(ms);
99e314eb
DDAG
2081 return 0;
2082}
2083
3d0684b2
JQ
2084/**
2085 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2086 *
2087 * Returns zero on success
2088 *
e0b266f0
DDAG
2089 * Transmit the set of pages to be discarded after precopy to the target;
2090 * these are pages that:
2091 * a) Have been previously transmitted but are now dirty again
2092 * b) Pages that have never been transmitted, this ensures that
2093 * any pages on the destination that have been mapped by background
2094 * tasks get discarded (transparent huge pages is the specific concern)
2095 * Hopefully this is pretty sparse
3d0684b2
JQ
2096 *
2097 * @ms: current migration state
e0b266f0
DDAG
2098 */
2099int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2100{
53518d94 2101 RAMState *rs = ram_state;
6b6712ef 2102 RAMBlock *block;
e0b266f0 2103 int ret;
e0b266f0 2104
89ac5a1d 2105 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2106
2107 /* This should be our last sync, the src is now paused */
eb859c53 2108 migration_bitmap_sync(rs);
e0b266f0 2109
6b6712ef
JQ
2110 /* Easiest way to make sure we don't resume in the middle of a host-page */
2111 rs->last_seen_block = NULL;
2112 rs->last_sent_block = NULL;
2113 rs->last_page = 0;
e0b266f0 2114
fbd162e6 2115 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2116 /* Deal with TPS != HPS and huge pages */
2117 ret = postcopy_chunk_hostpages(ms, block);
2118 if (ret) {
6b6712ef
JQ
2119 return ret;
2120 }
e0b266f0 2121
e0b266f0 2122#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
2123 ram_debug_dump_bitmap(block->bmap, true,
2124 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 2125#endif
6b6712ef
JQ
2126 }
2127 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2128
2129 ret = postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
2130
2131 return ret;
2132}
2133
3d0684b2
JQ
2134/**
2135 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2136 *
3d0684b2 2137 * Returns zero on success
e0b266f0 2138 *
36449157
JQ
2139 * @rbname: name of the RAMBlock of the request. NULL means the
2140 * same that last one.
3d0684b2
JQ
2141 * @start: RAMBlock starting page
2142 * @length: RAMBlock size
e0b266f0 2143 */
aaa2064c 2144int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2145{
36449157 2146 trace_ram_discard_range(rbname, start, length);
d3a5038c 2147
89ac5a1d 2148 RCU_READ_LOCK_GUARD();
36449157 2149 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2150
2151 if (!rb) {
36449157 2152 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2153 return -1;
e0b266f0
DDAG
2154 }
2155
814bb08f
PX
2156 /*
2157 * On source VM, we don't need to update the received bitmap since
2158 * we don't even have one.
2159 */
2160 if (rb->receivedmap) {
2161 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2162 length >> qemu_target_page_bits());
2163 }
2164
03acb4e9 2165 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2166}
2167
84593a08
PX
2168/*
2169 * For every allocation, we will try not to crash the VM if the
2170 * allocation failed.
2171 */
2172static int xbzrle_init(void)
2173{
2174 Error *local_err = NULL;
2175
2176 if (!migrate_use_xbzrle()) {
2177 return 0;
2178 }
2179
2180 XBZRLE_cache_lock();
2181
2182 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2183 if (!XBZRLE.zero_target_page) {
2184 error_report("%s: Error allocating zero page", __func__);
2185 goto err_out;
2186 }
2187
2188 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2189 TARGET_PAGE_SIZE, &local_err);
2190 if (!XBZRLE.cache) {
2191 error_report_err(local_err);
2192 goto free_zero_page;
2193 }
2194
2195 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2196 if (!XBZRLE.encoded_buf) {
2197 error_report("%s: Error allocating encoded_buf", __func__);
2198 goto free_cache;
2199 }
2200
2201 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2202 if (!XBZRLE.current_buf) {
2203 error_report("%s: Error allocating current_buf", __func__);
2204 goto free_encoded_buf;
2205 }
2206
2207 /* We are all good */
2208 XBZRLE_cache_unlock();
2209 return 0;
2210
2211free_encoded_buf:
2212 g_free(XBZRLE.encoded_buf);
2213 XBZRLE.encoded_buf = NULL;
2214free_cache:
2215 cache_fini(XBZRLE.cache);
2216 XBZRLE.cache = NULL;
2217free_zero_page:
2218 g_free(XBZRLE.zero_target_page);
2219 XBZRLE.zero_target_page = NULL;
2220err_out:
2221 XBZRLE_cache_unlock();
2222 return -ENOMEM;
2223}
2224
53518d94 2225static int ram_state_init(RAMState **rsp)
56e93d26 2226{
7d00ee6a
PX
2227 *rsp = g_try_new0(RAMState, 1);
2228
2229 if (!*rsp) {
2230 error_report("%s: Init ramstate fail", __func__);
2231 return -1;
2232 }
53518d94
JQ
2233
2234 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2235 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2236 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2237
7d00ee6a 2238 /*
40c4d4a8
IR
2239 * Count the total number of pages used by ram blocks not including any
2240 * gaps due to alignment or unplugs.
03158519 2241 * This must match with the initial values of dirty bitmap.
7d00ee6a 2242 */
40c4d4a8 2243 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2244 ram_state_reset(*rsp);
2245
2246 return 0;
2247}
2248
d6eff5d7 2249static void ram_list_init_bitmaps(void)
7d00ee6a 2250{
002cad6b 2251 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2252 RAMBlock *block;
2253 unsigned long pages;
002cad6b 2254 uint8_t shift;
56e93d26 2255
0827b9e9
AA
2256 /* Skip setting bitmap if there is no RAM */
2257 if (ram_bytes_total()) {
002cad6b
PX
2258 shift = ms->clear_bitmap_shift;
2259 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2260 error_report("clear_bitmap_shift (%u) too big, using "
2261 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2262 shift = CLEAR_BITMAP_SHIFT_MAX;
2263 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2264 error_report("clear_bitmap_shift (%u) too small, using "
2265 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2266 shift = CLEAR_BITMAP_SHIFT_MIN;
2267 }
2268
fbd162e6 2269 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2270 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2271 /*
2272 * The initial dirty bitmap for migration must be set with all
2273 * ones to make sure we'll migrate every guest RAM page to
2274 * destination.
40c4d4a8
IR
2275 * Here we set RAMBlock.bmap all to 1 because when restarting a
2276 * new migration after a failed one, ram_list.
2277 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2278 * guest memory.
03158519 2279 */
6b6712ef 2280 block->bmap = bitmap_new(pages);
40c4d4a8 2281 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2282 block->clear_bmap_shift = shift;
2283 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2284 }
f3f491fc 2285 }
d6eff5d7
PX
2286}
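/*
 * A sketch of the clear_bmap sizing above.  Each bit of clear_bmap covers a
 * chunk of 1 << clear_bmap_shift target pages whose dirty log still has to
 * be cleared lazily; the real clear_bmap_size() helper lives outside this
 * file and is assumed to round up to whole chunks, roughly as below.
 */
static unsigned long example_clear_bmap_bits(unsigned long pages,
                                             unsigned int shift)
{
    unsigned long chunk_pages = 1UL << shift;   /* e.g. shift 18 -> 256K pages, 1 GiB of 4 KiB pages */

    return (pages + chunk_pages - 1) / chunk_pages;   /* one bit per chunk, rounded up */
}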
2287
2288static void ram_init_bitmaps(RAMState *rs)
2289{
2290 /* For memory_global_dirty_log_start below. */
2291 qemu_mutex_lock_iothread();
2292 qemu_mutex_lock_ramlist();
f3f491fc 2293
89ac5a1d
DDAG
2294 WITH_RCU_READ_LOCK_GUARD() {
2295 ram_list_init_bitmaps();
2296 memory_global_dirty_log_start();
2297 migration_bitmap_sync_precopy(rs);
2298 }
56e93d26 2299 qemu_mutex_unlock_ramlist();
49877834 2300 qemu_mutex_unlock_iothread();
d6eff5d7
PX
2301}
2302
2303static int ram_init_all(RAMState **rsp)
2304{
2305 if (ram_state_init(rsp)) {
2306 return -1;
2307 }
2308
2309 if (xbzrle_init()) {
2310 ram_state_cleanup(rsp);
2311 return -1;
2312 }
2313
2314 ram_init_bitmaps(*rsp);
a91246c9
HZ
2315
2316 return 0;
2317}
2318
08614f34
PX
2319static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2320{
2321 RAMBlock *block;
2322 uint64_t pages = 0;
2323
2324 /*
2325 * Postcopy is not using xbzrle/compression, so no need for that.
2326 * Also, since the source is already halted, we don't need to care
2327 * about dirty page logging either.
2328 */
2329
fbd162e6 2330 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2331 pages += bitmap_count_one(block->bmap,
2332 block->used_length >> TARGET_PAGE_BITS);
2333 }
2334
2335 /* This may not be aligned with current bitmaps. Recalculate. */
2336 rs->migration_dirty_pages = pages;
2337
2338 rs->last_seen_block = NULL;
2339 rs->last_sent_block = NULL;
2340 rs->last_page = 0;
2341 rs->last_version = ram_list.version;
2342 /*
2343 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2344 * matter what we have sent.
2345 */
2346 rs->ram_bulk_stage = false;
2347
2348 /* Update RAMState cache of output QEMUFile */
2349 rs->f = out;
2350
2351 trace_ram_state_resume_prepare(pages);
2352}
2353
6bcb05fc
WW
2354/*
2355 * This function clears bits of the free pages reported by the caller from the
2356 * migration dirty bitmap. @addr is the host address corresponding to the
2357 * start of the contiguous guest free pages, and @len is the total bytes of
2358 * those pages.
2359 */
2360void qemu_guest_free_page_hint(void *addr, size_t len)
2361{
2362 RAMBlock *block;
2363 ram_addr_t offset;
2364 size_t used_len, start, npages;
2365 MigrationState *s = migrate_get_current();
2366
2367 /* This function is currently expected to be used during live migration */
2368 if (!migration_is_setup_or_active(s->state)) {
2369 return;
2370 }
2371
2372 for (; len > 0; len -= used_len, addr += used_len) {
2373 block = qemu_ram_block_from_host(addr, false, &offset);
2374 if (unlikely(!block || offset >= block->used_length)) {
2375 /*
2376 * The implementation might not support RAMBlock resize during
2377 * live migration, but it could happen in theory with future
2378 * updates. So we add a check here to capture that case.
2379 */
2380 error_report_once("%s unexpected error", __func__);
2381 return;
2382 }
2383
2384 if (len <= block->used_length - offset) {
2385 used_len = len;
2386 } else {
2387 used_len = block->used_length - offset;
2388 }
2389
2390 start = offset >> TARGET_PAGE_BITS;
2391 npages = used_len >> TARGET_PAGE_BITS;
2392
2393 qemu_mutex_lock(&ram_state->bitmap_mutex);
2394 ram_state->migration_dirty_pages -=
2395 bitmap_count_one_with_offset(block->bmap, start, npages);
2396 bitmap_clear(block->bmap, start, npages);
2397 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2398 }
2399}
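/*
 * A minimal sketch of the per-block chunking in the loop above: a free-page
 * hint (addr, len) may straddle the end of the RAMBlock that contains addr,
 * so each pass only clears the part that fits in the current block and then
 * advances by that amount.  4096 stands in for TARGET_PAGE_SIZE and the
 * example_ names are illustrative.
 */
static unsigned long example_hint_to_range(unsigned long offset_in_block,
                                           unsigned long len,
                                           unsigned long block_used_length,
                                           unsigned long *start_page,
                                           unsigned long *npages)
{
    unsigned long used_len = block_used_length - offset_in_block;

    if (len < used_len) {
        used_len = len;                      /* the hint ends inside this block */
    }
    *start_page = offset_in_block / 4096;    /* first target page to clear */
    *npages = used_len / 4096;               /* number of target pages to clear */
    return used_len;                         /* advance addr/len by this much */
}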
2400
3d0684b2
JQ
2401/*
2402 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2403 * a long-running RCU critical section. When RCU reclaims in the code
2404 * start to become numerous it will be necessary to reduce the
2405 * granularity of these critical sections.
2406 */
2407
3d0684b2
JQ
2408/**
2409 * ram_save_setup: Setup RAM for migration
2410 *
2411 * Returns zero to indicate success and negative for error
2412 *
2413 * @f: QEMUFile where to send the data
2414 * @opaque: RAMState pointer
2415 */
a91246c9
HZ
2416static int ram_save_setup(QEMUFile *f, void *opaque)
2417{
53518d94 2418 RAMState **rsp = opaque;
a91246c9
HZ
2419 RAMBlock *block;
2420
dcaf446e
XG
2421 if (compress_threads_save_setup()) {
2422 return -1;
2423 }
2424
a91246c9
HZ
2425 /* migration has already setup the bitmap, reuse it. */
2426 if (!migration_in_colo_state()) {
7d00ee6a 2427 if (ram_init_all(rsp) != 0) {
dcaf446e 2428 compress_threads_save_cleanup();
a91246c9 2429 return -1;
53518d94 2430 }
a91246c9 2431 }
53518d94 2432 (*rsp)->f = f;
a91246c9 2433
0e6ebd48
DDAG
2434 WITH_RCU_READ_LOCK_GUARD() {
2435 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2436
0e6ebd48
DDAG
2437 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2438 qemu_put_byte(f, strlen(block->idstr));
2439 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2440 qemu_put_be64(f, block->used_length);
2441 if (migrate_postcopy_ram() && block->page_size !=
2442 qemu_host_page_size) {
2443 qemu_put_be64(f, block->page_size);
2444 }
2445 if (migrate_ignore_shared()) {
2446 qemu_put_be64(f, block->mr->addr);
2447 }
fbd162e6 2448 }
56e93d26
JQ
2449 }
2450
56e93d26
JQ
2451 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2452 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2453
99f2c6fb 2454 multifd_send_sync_main(f);
56e93d26 2455 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2456 qemu_fflush(f);
56e93d26
JQ
2457
2458 return 0;
2459}
2460
3d0684b2
JQ
2461/**
2462 * ram_save_iterate: iterative stage for migration
2463 *
2464 * Returns zero to indicate success and negative for error
2465 *
2466 * @f: QEMUFile where to send the data
2467 * @opaque: RAMState pointer
2468 */
56e93d26
JQ
2469static int ram_save_iterate(QEMUFile *f, void *opaque)
2470{
53518d94
JQ
2471 RAMState **temp = opaque;
2472 RAMState *rs = *temp;
3d4095b2 2473 int ret = 0;
56e93d26
JQ
2474 int i;
2475 int64_t t0;
5c90308f 2476 int done = 0;
56e93d26 2477
b2557345
PL
2478 if (blk_mig_bulk_active()) {
2479 /* Avoid transferring ram during bulk phase of block migration as
2480 * the bulk phase will usually take a long time and transferring
2481 * ram updates during that time is pointless. */
2482 goto out;
2483 }
2484
89ac5a1d
DDAG
2485 WITH_RCU_READ_LOCK_GUARD() {
2486 if (ram_list.version != rs->last_version) {
2487 ram_state_reset(rs);
2488 }
56e93d26 2489
89ac5a1d
DDAG
2490 /* Read version before ram_list.blocks */
2491 smp_rmb();
56e93d26 2492
89ac5a1d 2493 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2494
89ac5a1d
DDAG
2495 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2496 i = 0;
2497 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2498 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2499 int pages;
e03a34f8 2500
89ac5a1d
DDAG
2501 if (qemu_file_get_error(f)) {
2502 break;
2503 }
e8f3735f 2504
89ac5a1d
DDAG
2505 pages = ram_find_and_save_block(rs, false);
2506 /* no more pages to send */
2507 if (pages == 0) {
2508 done = 1;
2509 break;
2510 }
e8f3735f 2511
89ac5a1d
DDAG
2512 if (pages < 0) {
2513 qemu_file_set_error(f, pages);
56e93d26
JQ
2514 break;
2515 }
89ac5a1d
DDAG
2516
2517 rs->target_page_count += pages;
2518
644acf99
WY
2519 /*
2520 * During postcopy, it is necessary to make sure one whole host
2521 * page is sent in one chunk.
2522 */
2523 if (migrate_postcopy_ram()) {
2524 flush_compressed_data(rs);
2525 }
2526
89ac5a1d
DDAG
2527 /*
2528 * we want to check in the 1st loop, just in case it was the 1st
2529 * time and we had to sync the dirty bitmap.
2530 * qemu_clock_get_ns() is a bit expensive, so we only check each
2531 * some iterations
2532 */
2533 if ((i & 63) == 0) {
2534 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2535 1000000;
2536 if (t1 > MAX_WAIT) {
2537 trace_ram_save_iterate_big_wait(t1, i);
2538 break;
2539 }
2540 }
2541 i++;
56e93d26 2542 }
56e93d26 2543 }
56e93d26
JQ
2544
2545 /*
2546 * Must occur before EOS (or any QEMUFile operation)
2547 * because of RDMA protocol.
2548 */
2549 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2550
b2557345 2551out:
b69a0227
JQ
2552 if (ret >= 0
2553 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 2554 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2555 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2556 qemu_fflush(f);
2557 ram_counters.transferred += 8;
56e93d26 2558
3d4095b2
JQ
2559 ret = qemu_file_get_error(f);
2560 }
56e93d26
JQ
2561 if (ret < 0) {
2562 return ret;
2563 }
2564
5c90308f 2565 return done;
56e93d26
JQ
2566}
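/*
 * A stripped-down sketch of the polling trick in the loop above: the
 * relatively expensive clock read only happens once every 64 iterations,
 * and the loop bails out once more than MAX_WAIT ms have passed since t0.
 * example_now_ns() is a hypothetical stand-in for
 * qemu_clock_get_ns(QEMU_CLOCK_REALTIME).
 */
static bool example_should_break(uint64_t i, uint64_t t0_ns,
                                 uint64_t (*example_now_ns)(void))
{
    if ((i & 63) == 0) {
        uint64_t elapsed_ms = (example_now_ns() - t0_ns) / 1000000;

        if (elapsed_ms > MAX_WAIT) {
            return true;
        }
    }
    return false;
}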
2567
3d0684b2
JQ
2568/**
2569 * ram_save_complete: function called to send the remaining amount of ram
2570 *
e8f3735f 2571 * Returns zero to indicate success or negative on error
3d0684b2
JQ
2572 *
2573 * Called with iothread lock
2574 *
2575 * @f: QEMUFile where to send the data
2576 * @opaque: RAMState pointer
2577 */
56e93d26
JQ
2578static int ram_save_complete(QEMUFile *f, void *opaque)
2579{
53518d94
JQ
2580 RAMState **temp = opaque;
2581 RAMState *rs = *temp;
e8f3735f 2582 int ret = 0;
6f37bb8b 2583
89ac5a1d
DDAG
2584 WITH_RCU_READ_LOCK_GUARD() {
2585 if (!migration_in_postcopy()) {
2586 migration_bitmap_sync_precopy(rs);
2587 }
56e93d26 2588
89ac5a1d 2589 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 2590
89ac5a1d 2591 /* try transferring iterative blocks of memory */
56e93d26 2592
89ac5a1d
DDAG
2593 /* flush all remaining blocks regardless of rate limiting */
2594 while (true) {
2595 int pages;
56e93d26 2596
89ac5a1d
DDAG
2597 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2598 /* no more blocks to send */
2599 if (pages == 0) {
2600 break;
2601 }
2602 if (pages < 0) {
2603 ret = pages;
2604 break;
2605 }
e8f3735f 2606 }
56e93d26 2607
89ac5a1d
DDAG
2608 flush_compressed_data(rs);
2609 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2610 }
d09a6fde 2611
3d4095b2 2612 if (ret >= 0) {
99f2c6fb 2613 multifd_send_sync_main(rs->f);
3d4095b2
JQ
2614 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2615 qemu_fflush(f);
2616 }
56e93d26 2617
e8f3735f 2618 return ret;
56e93d26
JQ
2619}
2620
c31b098f 2621static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
2622 uint64_t *res_precopy_only,
2623 uint64_t *res_compatible,
2624 uint64_t *res_postcopy_only)
56e93d26 2625{
53518d94
JQ
2626 RAMState **temp = opaque;
2627 RAMState *rs = *temp;
56e93d26
JQ
2628 uint64_t remaining_size;
2629
9edabd4d 2630 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2631
5727309d 2632 if (!migration_in_postcopy() &&
663e6c1d 2633 remaining_size < max_size) {
56e93d26 2634 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
2635 WITH_RCU_READ_LOCK_GUARD() {
2636 migration_bitmap_sync_precopy(rs);
2637 }
56e93d26 2638 qemu_mutex_unlock_iothread();
9edabd4d 2639 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2640 }
c31b098f 2641
86e1167e
VSO
2642 if (migrate_postcopy_ram()) {
2643 /* We can do postcopy, and all the data is postcopiable */
47995026 2644 *res_compatible += remaining_size;
86e1167e 2645 } else {
47995026 2646 *res_precopy_only += remaining_size;
86e1167e 2647 }
56e93d26
JQ
2648}
2649
2650static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2651{
2652 unsigned int xh_len;
2653 int xh_flags;
063e760a 2654 uint8_t *loaded_data;
56e93d26 2655
56e93d26
JQ
2656 /* extract RLE header */
2657 xh_flags = qemu_get_byte(f);
2658 xh_len = qemu_get_be16(f);
2659
2660 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2661 error_report("Failed to load XBZRLE page - wrong compression!");
2662 return -1;
2663 }
2664
2665 if (xh_len > TARGET_PAGE_SIZE) {
2666 error_report("Failed to load XBZRLE page - len overflow!");
2667 return -1;
2668 }
f265e0e4 2669 loaded_data = XBZRLE.decoded_buf;
56e93d26 2670 /* load data and decode */
f265e0e4 2671 /* it can change loaded_data to point to an internal buffer */
063e760a 2672 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2673
2674 /* decode RLE */
063e760a 2675 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2676 TARGET_PAGE_SIZE) == -1) {
2677 error_report("Failed to load XBZRLE page - decode error!");
2678 return -1;
2679 }
2680
2681 return 0;
2682}
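/*
 * A minimal sketch of the XBZRLE record layout parsed above: one flags byte
 * (which must be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length,
 * then the encoded bytes.  This version works on an in-memory buffer rather
 * than a QEMUFile and the example_ name is illustrative only.
 */
static int example_parse_xbzrle_header(const unsigned char *buf,
                                       unsigned long buf_len,
                                       unsigned int *xh_flags,
                                       unsigned int *xh_len)
{
    if (buf_len < 3) {
        return -1;
    }
    *xh_flags = buf[0];                        /* flags byte */
    *xh_len = (buf[1] << 8) | buf[2];          /* big-endian 16-bit length */
    if (*xh_len > TARGET_PAGE_SIZE) {
        return -1;                             /* would overflow the decode buffer */
    }
    return 3;                                  /* header size; encoded data follows */
}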
2683
3d0684b2
JQ
2684/**
2685 * ram_block_from_stream: read a RAMBlock id from the migration stream
2686 *
2687 * Must be called from within a rcu critical section.
2688 *
56e93d26 2689 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2690 *
3d0684b2
JQ
2691 * @f: QEMUFile where to read the data from
2692 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2693 */
3d0684b2 2694static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2695{
2696 static RAMBlock *block = NULL;
2697 char id[256];
2698 uint8_t len;
2699
2700 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2701 if (!block) {
56e93d26
JQ
2702 error_report("Ack, bad migration stream!");
2703 return NULL;
2704 }
4c4bad48 2705 return block;
56e93d26
JQ
2706 }
2707
2708 len = qemu_get_byte(f);
2709 qemu_get_buffer(f, (uint8_t *)id, len);
2710 id[len] = 0;
2711
e3dd7493 2712 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2713 if (!block) {
2714 error_report("Can't find block %s", id);
2715 return NULL;
56e93d26
JQ
2716 }
2717
fbd162e6 2718 if (ramblock_is_ignored(block)) {
b895de50
CLG
2719 error_report("block %s should not be migrated !", id);
2720 return NULL;
2721 }
2722
4c4bad48
HZ
2723 return block;
2724}
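/*
 * A minimal sketch of the wire layout consumed above: when
 * RAM_SAVE_FLAG_CONTINUE is set the page belongs to the same RAMBlock as
 * the previous one and no block id follows; otherwise a 1-byte id length
 * and that many id characters are read.  The example_read_* callbacks are
 * hypothetical stand-ins for qemu_get_byte()/qemu_get_buffer().
 */
static int example_read_block_id(bool is_continue, char *id, unsigned id_size,
                                 int (*example_read_byte)(void),
                                 void (*example_read_bytes)(char *, unsigned))
{
    unsigned len;

    if (is_continue) {
        return 0;                       /* reuse the previously decoded block */
    }
    len = (unsigned)example_read_byte();    /* 1-byte id length */
    if (len >= id_size) {
        return -1;
    }
    example_read_bytes(id, len);        /* id bytes are not NUL-terminated on the wire */
    id[len] = '\0';
    return 1;                           /* a new block id was decoded */
}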
2725
2726static inline void *host_from_ram_block_offset(RAMBlock *block,
2727 ram_addr_t offset)
2728{
2729 if (!offset_in_ramblock(block, offset)) {
2730 return NULL;
2731 }
2732
2733 return block->host + offset;
56e93d26
JQ
2734}
2735
13af18f2
ZC
2736static inline void *colo_cache_from_block_offset(RAMBlock *block,
2737 ram_addr_t offset)
2738{
2739 if (!offset_in_ramblock(block, offset)) {
2740 return NULL;
2741 }
2742 if (!block->colo_cache) {
2743 error_report("%s: colo_cache is NULL in block :%s",
2744 __func__, block->idstr);
2745 return NULL;
2746 }
7d9acafa
ZC
2747
2748 /*
2749 * During COLO checkpoint, we need a bitmap of these migrated pages.
2750 * It helps us decide which pages in the RAM cache should be flushed
2751 * into the VM's RAM later.
2752 */
2753 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2754 ram_state->migration_dirty_pages++;
2755 }
13af18f2
ZC
2756 return block->colo_cache + offset;
2757}
2758
3d0684b2
JQ
2759/**
2760 * ram_handle_compressed: handle the zero page case
2761 *
56e93d26
JQ
2762 * If a page (or a whole RDMA chunk) has been
2763 * determined to be zero, then zap it.
3d0684b2
JQ
2764 *
2765 * @host: host address for the zero page
2766 * @ch: what the page is filled from. We only support zero
2767 * @size: size of the zero page
56e93d26
JQ
2768 */
2769void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2770{
2771 if (ch != 0 || !is_zero_range(host, size)) {
2772 memset(host, ch, size);
2773 }
2774}
2775
797ca154
XG
2776/* return the size after decompression, or negative value on error */
2777static int
2778qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2779 const uint8_t *source, size_t source_len)
2780{
2781 int err;
2782
2783 err = inflateReset(stream);
2784 if (err != Z_OK) {
2785 return -1;
2786 }
2787
2788 stream->avail_in = source_len;
2789 stream->next_in = (uint8_t *)source;
2790 stream->avail_out = dest_len;
2791 stream->next_out = dest;
2792
2793 err = inflate(stream, Z_NO_FLUSH);
2794 if (err != Z_STREAM_END) {
2795 return -1;
2796 }
2797
2798 return stream->total_out;
2799}
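/*
 * For reference, a sketch of the mirror-image zlib call sequence on the
 * compression side.  The real sending-side compression lives elsewhere; this
 * only illustrates the stream API that pairs with qemu_uncompress_data()
 * above, assuming the stream was set up with deflateInit() beforehand.
 */
static int example_compress_data(z_stream *stream, uint8_t *dest,
                                 size_t dest_len, const uint8_t *source,
                                 size_t source_len)
{
    int err;

    err = deflateReset(stream);        /* reuse the stream between pages */
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = deflate(stream, Z_FINISH);   /* one-shot: finish in a single call */
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;          /* compressed size on success */
}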
2800
56e93d26
JQ
2801static void *do_data_decompress(void *opaque)
2802{
2803 DecompressParam *param = opaque;
2804 unsigned long pagesize;
33d151f4 2805 uint8_t *des;
34ab9e97 2806 int len, ret;
56e93d26 2807
33d151f4 2808 qemu_mutex_lock(&param->mutex);
90e56fb4 2809 while (!param->quit) {
33d151f4
LL
2810 if (param->des) {
2811 des = param->des;
2812 len = param->len;
2813 param->des = 0;
2814 qemu_mutex_unlock(&param->mutex);
2815
56e93d26 2816 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
2817
2818 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2819 param->compbuf, len);
f548222c 2820 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
2821 error_report("decompress data failed");
2822 qemu_file_set_error(decomp_file, ret);
2823 }
73a8912b 2824
33d151f4
LL
2825 qemu_mutex_lock(&decomp_done_lock);
2826 param->done = true;
2827 qemu_cond_signal(&decomp_done_cond);
2828 qemu_mutex_unlock(&decomp_done_lock);
2829
2830 qemu_mutex_lock(&param->mutex);
2831 } else {
2832 qemu_cond_wait(&param->cond, &param->mutex);
2833 }
56e93d26 2834 }
33d151f4 2835 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2836
2837 return NULL;
2838}
2839
34ab9e97 2840static int wait_for_decompress_done(void)
5533b2e9
LL
2841{
2842 int idx, thread_count;
2843
2844 if (!migrate_use_compression()) {
34ab9e97 2845 return 0;
5533b2e9
LL
2846 }
2847
2848 thread_count = migrate_decompress_threads();
2849 qemu_mutex_lock(&decomp_done_lock);
2850 for (idx = 0; idx < thread_count; idx++) {
2851 while (!decomp_param[idx].done) {
2852 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2853 }
2854 }
2855 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 2856 return qemu_file_get_error(decomp_file);
5533b2e9
LL
2857}
2858
f0afa331 2859static void compress_threads_load_cleanup(void)
56e93d26
JQ
2860{
2861 int i, thread_count;
2862
3416ab5b
JQ
2863 if (!migrate_use_compression()) {
2864 return;
2865 }
56e93d26
JQ
2866 thread_count = migrate_decompress_threads();
2867 for (i = 0; i < thread_count; i++) {
797ca154
XG
2868 /*
2869 * we use it as an indicator of whether the thread has been
2870 * properly initialized or not
2871 */
2872 if (!decomp_param[i].compbuf) {
2873 break;
2874 }
2875
56e93d26 2876 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2877 decomp_param[i].quit = true;
56e93d26
JQ
2878 qemu_cond_signal(&decomp_param[i].cond);
2879 qemu_mutex_unlock(&decomp_param[i].mutex);
2880 }
2881 for (i = 0; i < thread_count; i++) {
797ca154
XG
2882 if (!decomp_param[i].compbuf) {
2883 break;
2884 }
2885
56e93d26
JQ
2886 qemu_thread_join(decompress_threads + i);
2887 qemu_mutex_destroy(&decomp_param[i].mutex);
2888 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 2889 inflateEnd(&decomp_param[i].stream);
56e93d26 2890 g_free(decomp_param[i].compbuf);
797ca154 2891 decomp_param[i].compbuf = NULL;
56e93d26
JQ
2892 }
2893 g_free(decompress_threads);
2894 g_free(decomp_param);
56e93d26
JQ
2895 decompress_threads = NULL;
2896 decomp_param = NULL;
34ab9e97 2897 decomp_file = NULL;
56e93d26
JQ
2898}
2899
34ab9e97 2900static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
2901{
2902 int i, thread_count;
2903
2904 if (!migrate_use_compression()) {
2905 return 0;
2906 }
2907
2908 thread_count = migrate_decompress_threads();
2909 decompress_threads = g_new0(QemuThread, thread_count);
2910 decomp_param = g_new0(DecompressParam, thread_count);
2911 qemu_mutex_init(&decomp_done_lock);
2912 qemu_cond_init(&decomp_done_cond);
34ab9e97 2913 decomp_file = f;
797ca154
XG
2914 for (i = 0; i < thread_count; i++) {
2915 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2916 goto exit;
2917 }
2918
2919 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2920 qemu_mutex_init(&decomp_param[i].mutex);
2921 qemu_cond_init(&decomp_param[i].cond);
2922 decomp_param[i].done = true;
2923 decomp_param[i].quit = false;
2924 qemu_thread_create(decompress_threads + i, "decompress",
2925 do_data_decompress, decomp_param + i,
2926 QEMU_THREAD_JOINABLE);
2927 }
2928 return 0;
2929exit:
2930 compress_threads_load_cleanup();
2931 return -1;
2932}
2933
c1bc6626 2934static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2935 void *host, int len)
2936{
2937 int idx, thread_count;
2938
2939 thread_count = migrate_decompress_threads();
73a8912b 2940 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2941 while (true) {
2942 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2943 if (decomp_param[idx].done) {
33d151f4
LL
2944 decomp_param[idx].done = false;
2945 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2946 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2947 decomp_param[idx].des = host;
2948 decomp_param[idx].len = len;
33d151f4
LL
2949 qemu_cond_signal(&decomp_param[idx].cond);
2950 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2951 break;
2952 }
2953 }
2954 if (idx < thread_count) {
2955 break;
73a8912b
LL
2956 } else {
2957 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2958 }
2959 }
73a8912b 2960 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2961}
2962
13af18f2
ZC
2963/*
2964 * colo cache: this is for the secondary VM; we cache the whole
2965 * memory of the secondary VM. The global lock must be held
2966 * to call this helper.
2967 */
2968int colo_init_ram_cache(void)
2969{
2970 RAMBlock *block;
2971
44901b5a
PB
2972 WITH_RCU_READ_LOCK_GUARD() {
2973 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2974 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2975 NULL,
2976 false);
2977 if (!block->colo_cache) {
2978 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2979 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2980 block->used_length);
2981 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2982 if (block->colo_cache) {
2983 qemu_anon_ram_free(block->colo_cache, block->used_length);
2984 block->colo_cache = NULL;
2985 }
89ac5a1d 2986 }
44901b5a 2987 return -errno;
89ac5a1d 2988 }
44901b5a 2989 memcpy(block->colo_cache, block->host, block->used_length);
13af18f2 2990 }
13af18f2 2991 }
44901b5a 2992
7d9acafa
ZC
2993 /*
2994 * Record the dirty pages sent by the PVM; we use this dirty bitmap
2995 * to decide which pages in the cache should be flushed into the SVM's RAM.
2996 * Here we use the same name 'ram_bitmap' as for migration.
2997 */
2998 if (ram_bytes_total()) {
2999 RAMBlock *block;
3000
fbd162e6 3001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3002 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3003
3004 block->bmap = bitmap_new(pages);
3005 bitmap_set(block->bmap, 0, pages);
3006 }
3007 }
3008 ram_state = g_new0(RAMState, 1);
3009 ram_state->migration_dirty_pages = 0;
c6e5bafb 3010 qemu_mutex_init(&ram_state->bitmap_mutex);
d1955d22 3011 memory_global_dirty_log_start();
7d9acafa 3012
13af18f2 3013 return 0;
13af18f2
ZC
3014}
3015
3016/* The global lock must be held to call this helper */
3017void colo_release_ram_cache(void)
3018{
3019 RAMBlock *block;
3020
d1955d22 3021 memory_global_dirty_log_stop();
fbd162e6 3022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3023 g_free(block->bmap);
3024 block->bmap = NULL;
3025 }
3026
89ac5a1d
DDAG
3027 WITH_RCU_READ_LOCK_GUARD() {
3028 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3029 if (block->colo_cache) {
3030 qemu_anon_ram_free(block->colo_cache, block->used_length);
3031 block->colo_cache = NULL;
3032 }
13af18f2
ZC
3033 }
3034 }
c6e5bafb 3035 qemu_mutex_destroy(&ram_state->bitmap_mutex);
7d9acafa
ZC
3036 g_free(ram_state);
3037 ram_state = NULL;
13af18f2
ZC
3038}
3039
f265e0e4
JQ
3040/**
3041 * ram_load_setup: Setup RAM for migration incoming side
3042 *
3043 * Returns zero to indicate success and negative for error
3044 *
3045 * @f: QEMUFile where to receive the data
3046 * @opaque: RAMState pointer
3047 */
3048static int ram_load_setup(QEMUFile *f, void *opaque)
3049{
34ab9e97 3050 if (compress_threads_load_setup(f)) {
797ca154
XG
3051 return -1;
3052 }
3053
f265e0e4 3054 xbzrle_load_setup();
f9494614 3055 ramblock_recv_map_init();
13af18f2 3056
f265e0e4
JQ
3057 return 0;
3058}
3059
3060static int ram_load_cleanup(void *opaque)
3061{
f9494614 3062 RAMBlock *rb;
56eb90af 3063
fbd162e6 3064 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3065 qemu_ram_block_writeback(rb);
56eb90af
JH
3066 }
3067
f265e0e4 3068 xbzrle_load_cleanup();
f0afa331 3069 compress_threads_load_cleanup();
f9494614 3070
fbd162e6 3071 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3072 g_free(rb->receivedmap);
3073 rb->receivedmap = NULL;
3074 }
13af18f2 3075
f265e0e4
JQ
3076 return 0;
3077}
3078
3d0684b2
JQ
3079/**
3080 * ram_postcopy_incoming_init: allocate postcopy data structures
3081 *
3082 * Returns 0 for success and negative if there was one error
3083 *
3084 * @mis: current migration incoming state
3085 *
3086 * Allocate data structures etc needed by incoming migration with
3087 * postcopy-ram. postcopy-ram's similarly named
3088 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3089 */
3090int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3091{
c136180c 3092 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3093}
3094
3d0684b2
JQ
3095/**
3096 * ram_load_postcopy: load a page in postcopy case
3097 *
3098 * Returns 0 for success or -errno in case of error
3099 *
a7180877
DDAG
3100 * Called in postcopy mode by ram_load().
3101 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3102 *
3103 * @f: QEMUFile to receive the data from
a7180877
DDAG
3104 */
3105static int ram_load_postcopy(QEMUFile *f)
3106{
3107 int flags = 0, ret = 0;
3108 bool place_needed = false;
1aa83678 3109 bool matches_target_page_size = false;
a7180877
DDAG
3110 MigrationIncomingState *mis = migration_incoming_get_current();
3111 /* Temporary page that is later 'placed' */
3414322a 3112 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 3113 void *this_host = NULL;
a3b6ff6d 3114 bool all_zero = false;
4cbb3c63 3115 int target_pages = 0;
a7180877
DDAG
3116
3117 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3118 ram_addr_t addr;
3119 void *host = NULL;
3120 void *page_buffer = NULL;
3121 void *place_source = NULL;
df9ff5e1 3122 RAMBlock *block = NULL;
a7180877 3123 uint8_t ch;
644acf99 3124 int len;
a7180877
DDAG
3125
3126 addr = qemu_get_be64(f);
7a9ddfbf
PX
3127
3128 /*
3129 * If qemu file error, we should stop here, and then "addr"
3130 * may be invalid
3131 */
3132 ret = qemu_file_get_error(f);
3133 if (ret) {
3134 break;
3135 }
3136
a7180877
DDAG
3137 flags = addr & ~TARGET_PAGE_MASK;
3138 addr &= TARGET_PAGE_MASK;
3139
3140 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3141 place_needed = false;
644acf99
WY
3142 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3143 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3144 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3145
3146 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3147 if (!host) {
3148 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3149 ret = -EINVAL;
3150 break;
3151 }
4cbb3c63 3152 target_pages++;
1aa83678 3153 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3154 /*
28abd200
DDAG
3155 * Postcopy requires that we place whole host pages atomically;
3156 * these may be huge pages for RAMBlocks that are backed by
3157 * hugetlbfs.
a7180877
DDAG
3158 * To make it atomic, the data is read into a temporary page
3159 * that's moved into place later.
3160 * The migration protocol uses, possibly smaller, target-pages
3161 * however the source ensures it always sends all the components
91ba442f 3162 * of a host page in one chunk.
a7180877
DDAG
3163 */
3164 page_buffer = postcopy_host_page +
28abd200 3165 ((uintptr_t)host & (block->page_size - 1));
a7180877 3166 /* If all TP are zero then we can optimise the place */
e5e73b0f 3167 if (target_pages == 1) {
a7180877 3168 all_zero = true;
91ba442f
WY
3169 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3170 block->page_size);
c53b7ddc
DDAG
3171 } else {
3172 /* not the 1st TP within the HP */
91ba442f
WY
3173 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3174 (uintptr_t)this_host) {
3175 error_report("Non-same host page %p/%p",
3176 host, this_host);
c53b7ddc
DDAG
3177 ret = -EINVAL;
3178 break;
3179 }
a7180877
DDAG
3180 }
3181
3182 /*
3183 * If it's the last part of a host page then we place the host
3184 * page
3185 */
4cbb3c63
WY
3186 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3187 place_needed = true;
3188 target_pages = 0;
3189 }
a7180877
DDAG
3190 place_source = postcopy_host_page;
3191 }
3192
3193 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3194 case RAM_SAVE_FLAG_ZERO:
a7180877 3195 ch = qemu_get_byte(f);
2e36bc1b
WY
3196 /*
3197 * We can skip setting page_buffer when
3198 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3199 */
3200 if (ch || !matches_target_page_size) {
3201 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3202 }
a7180877
DDAG
3203 if (ch) {
3204 all_zero = false;
3205 }
3206 break;
3207
3208 case RAM_SAVE_FLAG_PAGE:
3209 all_zero = false;
1aa83678
PX
3210 if (!matches_target_page_size) {
3211 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3212 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3213 } else {
1aa83678
PX
3214 /*
3215 * For small pages that matches target page size, we
3216 * avoid the qemu_file copy. Instead we directly use
3217 * the buffer of QEMUFile to place the page. Note: we
3218 * cannot do any QEMUFile operation before using that
3219 * buffer to make sure the buffer is valid when
3220 * placing the page.
a7180877
DDAG
3221 */
3222 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3223 TARGET_PAGE_SIZE);
3224 }
3225 break;
644acf99
WY
3226 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3227 all_zero = false;
3228 len = qemu_get_be32(f);
3229 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3230 error_report("Invalid compressed data length: %d", len);
3231 ret = -EINVAL;
3232 break;
3233 }
3234 decompress_data_with_multi_threads(f, page_buffer, len);
3235 break;
3236
a7180877
DDAG
3237 case RAM_SAVE_FLAG_EOS:
3238 /* normal exit */
6df264ac 3239 multifd_recv_sync_main();
a7180877
DDAG
3240 break;
3241 default:
3242 error_report("Unknown combination of migration flags: %#x"
3243 " (postcopy mode)", flags);
3244 ret = -EINVAL;
7a9ddfbf
PX
3245 break;
3246 }
3247
644acf99
WY
3248 /* Got the whole host page, wait for decompress before placing. */
3249 if (place_needed) {
3250 ret |= wait_for_decompress_done();
3251 }
3252
7a9ddfbf
PX
3253 /* Detect for any possible file errors */
3254 if (!ret && qemu_file_get_error(f)) {
3255 ret = qemu_file_get_error(f);
a7180877
DDAG
3256 }
3257
7a9ddfbf 3258 if (!ret && place_needed) {
a7180877 3259 /* This gets called at the last target page in the host page */
91ba442f
WY
3260 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3261 block->page_size);
df9ff5e1 3262
a7180877 3263 if (all_zero) {
df9ff5e1 3264 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3265 block);
a7180877 3266 } else {
df9ff5e1 3267 ret = postcopy_place_page(mis, place_dest,
8be4620b 3268 place_source, block);
a7180877
DDAG
3269 }
3270 }
a7180877
DDAG
3271 }
3272
3273 return ret;
3274}
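/*
 * A small sketch of the temporary-buffer arithmetic used above for
 * hugepage-backed blocks in postcopy: each incoming target page is copied
 * into the slot of the temporary host page matching its offset within the
 * final host page, and the whole host page is placed atomically once its
 * last target page has arrived.  The names are illustrative stand-ins for
 * postcopy_host_page, host and block->page_size.
 */
static void *example_target_page_slot(void *tmp_host_page,
                                      uintptr_t host_addr,
                                      uintptr_t host_page_size)
{
    /* offset of this target page inside its host page (size is a power of two) */
    uintptr_t offset_in_host_page = host_addr & (host_page_size - 1);

    return (char *)tmp_host_page + offset_in_host_page;
}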
3275
acab30b8
DHB
3276static bool postcopy_is_advised(void)
3277{
3278 PostcopyState ps = postcopy_state_get();
3279 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3280}
3281
3282static bool postcopy_is_running(void)
3283{
3284 PostcopyState ps = postcopy_state_get();
3285 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3286}
3287
e6f4aa18
ZC
3288/*
3289 * Flush content of RAM cache into SVM's memory.
3290 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3291 */
3292static void colo_flush_ram_cache(void)
3293{
3294 RAMBlock *block = NULL;
3295 void *dst_host;
3296 void *src_host;
3297 unsigned long offset = 0;
3298
d1955d22 3299 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3300 WITH_RCU_READ_LOCK_GUARD() {
3301 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3302 ramblock_sync_dirty_bitmap(ram_state, block);
3303 }
d1955d22 3304 }
d1955d22 3305
e6f4aa18 3306 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3307 WITH_RCU_READ_LOCK_GUARD() {
3308 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3309
89ac5a1d
DDAG
3310 while (block) {
3311 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 3312
8bba004c
AR
3313 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3314 >= block->used_length) {
89ac5a1d
DDAG
3315 offset = 0;
3316 block = QLIST_NEXT_RCU(block, next);
3317 } else {
3318 migration_bitmap_clear_dirty(ram_state, block, offset);
8bba004c
AR
3319 dst_host = block->host
3320 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3321 src_host = block->colo_cache
3322 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
89ac5a1d
DDAG
3323 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3324 }
e6f4aa18
ZC
3325 }
3326 }
e6f4aa18
ZC
3327 trace_colo_flush_ram_cache_end();
3328}
3329
10da4a36
WY
3330/**
3331 * ram_load_precopy: load pages in precopy case
3332 *
3333 * Returns 0 for success or -errno in case of error
3334 *
3335 * Called in precopy mode by ram_load().
3336 * rcu_read_lock is taken prior to this being called.
3337 *
3338 * @f: QEMUFile where to send the data
3339 */
3340static int ram_load_precopy(QEMUFile *f)
56e93d26 3341{
e65cec5e 3342 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3343 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3344 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3345 if (!migrate_use_compression()) {
3346 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3347 }
a7180877 3348
10da4a36 3349 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3350 ram_addr_t addr, total_ram_bytes;
a776aa15 3351 void *host = NULL;
56e93d26
JQ
3352 uint8_t ch;
3353
e65cec5e
YK
3354 /*
3355 * Yield periodically to let main loop run, but an iteration of
3356 * the main loop is expensive, so only do it once every few iterations
3357 */
3358 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3359 aio_co_schedule(qemu_get_current_aio_context(),
3360 qemu_coroutine_self());
3361 qemu_coroutine_yield();
3362 }
3363 i++;
3364
56e93d26
JQ
3365 addr = qemu_get_be64(f);
3366 flags = addr & ~TARGET_PAGE_MASK;
3367 addr &= TARGET_PAGE_MASK;
3368
edc60127
JQ
3369 if (flags & invalid_flags) {
3370 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3371 error_report("Received an unexpected compressed page");
3372 }
3373
3374 ret = -EINVAL;
3375 break;
3376 }
3377
bb890ed5 3378 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3379 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3380 RAMBlock *block = ram_block_from_stream(f, flags);
3381
13af18f2
ZC
3382 /*
3383 * After going into COLO, we should load the Page into colo_cache.
3384 */
3385 if (migration_incoming_in_colo_state()) {
3386 host = colo_cache_from_block_offset(block, addr);
3387 } else {
3388 host = host_from_ram_block_offset(block, addr);
3389 }
a776aa15
DDAG
3390 if (!host) {
3391 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3392 ret = -EINVAL;
3393 break;
3394 }
13af18f2
ZC
3395
3396 if (!migration_incoming_in_colo_state()) {
3397 ramblock_recv_bitmap_set(block, host);
3398 }
3399
1db9d8e5 3400 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3401 }
3402
56e93d26
JQ
3403 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3404 case RAM_SAVE_FLAG_MEM_SIZE:
3405 /* Synchronize RAM block list */
3406 total_ram_bytes = addr;
3407 while (!ret && total_ram_bytes) {
3408 RAMBlock *block;
56e93d26
JQ
3409 char id[256];
3410 ram_addr_t length;
3411
3412 len = qemu_get_byte(f);
3413 qemu_get_buffer(f, (uint8_t *)id, len);
3414 id[len] = 0;
3415 length = qemu_get_be64(f);
3416
e3dd7493 3417 block = qemu_ram_block_by_name(id);
b895de50
CLG
3418 if (block && !qemu_ram_is_migratable(block)) {
3419 error_report("block %s should not be migrated !", id);
3420 ret = -EINVAL;
3421 } else if (block) {
e3dd7493
DDAG
3422 if (length != block->used_length) {
3423 Error *local_err = NULL;
56e93d26 3424
fa53a0e5 3425 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3426 &local_err);
3427 if (local_err) {
3428 error_report_err(local_err);
56e93d26 3429 }
56e93d26 3430 }
ef08fb38
DDAG
3431 /* For postcopy we need to check hugepage sizes match */
3432 if (postcopy_advised &&
3433 block->page_size != qemu_host_page_size) {
3434 uint64_t remote_page_size = qemu_get_be64(f);
3435 if (remote_page_size != block->page_size) {
3436 error_report("Mismatched RAM page size %s "
3437 "(local) %zd != %" PRId64,
3438 id, block->page_size,
3439 remote_page_size);
3440 ret = -EINVAL;
3441 }
3442 }
fbd162e6
YK
3443 if (migrate_ignore_shared()) {
3444 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
3445 if (ramblock_is_ignored(block) &&
3446 block->mr->addr != addr) {
3447 error_report("Mismatched GPAs for block %s "
3448 "%" PRId64 "!= %" PRId64,
3449 id, (uint64_t)addr,
3450 (uint64_t)block->mr->addr);
3451 ret = -EINVAL;
3452 }
3453 }
e3dd7493
DDAG
3454 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3455 block->idstr);
3456 } else {
56e93d26
JQ
3457 error_report("Unknown ramblock \"%s\", cannot "
3458 "accept migration", id);
3459 ret = -EINVAL;
3460 }
3461
3462 total_ram_bytes -= length;
3463 }
3464 break;
a776aa15 3465
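        /*
         * A zero page is transmitted as just its fill byte;
         * ram_handle_compressed() fills the whole page with that byte (and
         * can skip the memset when the page is already zero-filled).
         */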
bb890ed5 3466 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
3467 ch = qemu_get_byte(f);
3468 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3469 break;
a776aa15 3470
56e93d26 3471 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
3472 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3473 break;
56e93d26 3474
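        /*
         * A compressed page is a be32 payload length followed by the
         * compressed data; decompression is handed off to the decompress
         * threads, and completion is awaited via wait_for_decompress_done()
         * after the loop.
         */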
a776aa15 3475 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
3476 len = qemu_get_be32(f);
3477 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3478 error_report("Invalid compressed data length: %d", len);
3479 ret = -EINVAL;
3480 break;
3481 }
c1bc6626 3482 decompress_data_with_multi_threads(f, host, len);
56e93d26 3483 break;
a776aa15 3484
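        /*
         * An XBZRLE chunk is, roughly, an encoded delta that load_xbzrle()
         * applies on top of the page's current contents at 'host'.
         */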
56e93d26 3485 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
3486 if (load_xbzrle(f, addr, host) < 0) {
3487 error_report("Failed to decompress XBZRLE page at "
3488 RAM_ADDR_FMT, addr);
3489 ret = -EINVAL;
3490 break;
3491 }
3492 break;
3493 case RAM_SAVE_FLAG_EOS:
3494 /* normal exit */
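            /*
             * The source emits EOS at the end of each iteration; the multifd
             * receive channels are synchronised here (effectively a no-op
             * when multifd is not in use) before control returns.
             */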
6df264ac 3495 multifd_recv_sync_main();
56e93d26
JQ
3496 break;
3497 default:
3498 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 3499 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
3500 } else {
3501 error_report("Unknown combination of migration flags: %#x",
3502 flags);
3503 ret = -EINVAL;
3504 }
3505 }
3506 if (!ret) {
3507 ret = qemu_file_get_error(f);
3508 }
3509 }
3510
ca1a6b70 3511 ret |= wait_for_decompress_done();
10da4a36
WY
3512 return ret;
3513}
3514
3515static int ram_load(QEMUFile *f, void *opaque, int version_id)
3516{
3517 int ret = 0;
3518 static uint64_t seq_iter;
3519 /*
3520 * If the system is running in postcopy mode, page inserts into host
3521 * memory must be atomic.
3522 */
3523 bool postcopy_running = postcopy_is_running();
3524
3525 seq_iter++;
3526
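    /*
     * Only section version 4 is accepted; it matches the version passed to
     * register_savevm_live() in ram_mig_init() below.
     */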
3527 if (version_id != 4) {
3528 return -EINVAL;
3529 }
3530
3531 /*
3532 * This RCU critical section can be very long running.
3533 * When RCU reclamations in the code start to become numerous,
3534 * it will be necessary to reduce the granularity of this
3535 * critical section.
3536 */
89ac5a1d
DDAG
3537 WITH_RCU_READ_LOCK_GUARD() {
3538 if (postcopy_running) {
3539 ret = ram_load_postcopy(f);
3540 } else {
3541 ret = ram_load_precopy(f);
3542 }
10da4a36 3543 }
55c4446b 3544 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
3545
3546 if (!ret && migration_incoming_in_colo_state()) {
3547 colo_flush_ram_cache();
3548 }
56e93d26
JQ
3549 return ret;
3550}
3551
c6467627
VSO
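/*
 * Postcopy is only offered when none of the non-ignored RAM blocks is
 * backed by persistent memory (see the pmem check below).
 */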
3552static bool ram_has_postcopy(void *opaque)
3553{
469dd51b 3554 RAMBlock *rb;
fbd162e6 3555 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
3556 if (ramblock_is_pmem(rb)) {
3557 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3558 "is not supported now!", rb->idstr, rb->host);
3559 return false;
3560 }
3561 }
3562
c6467627
VSO
3563 return migrate_postcopy_ram();
3564}
3565
edd090c7
PX
3566 /* Sync all the dirty bitmaps with the destination VM. */
3567static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3568{
3569 RAMBlock *block;
3570 QEMUFile *file = s->to_dst_file;
3571 int ramblock_count = 0;
3572
3573 trace_ram_dirty_bitmap_sync_start();
3574
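    /*
     * Request each non-ignored block's received bitmap from the destination;
     * the replies come back on the return path and are handled by
     * ram_dirty_bitmap_reload(), which posts rp_sem once per block (see
     * ram_dirty_bitmap_reload_notify() below).
     */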
fbd162e6 3575 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
3576 qemu_savevm_send_recv_bitmap(file, block->idstr);
3577 trace_ram_dirty_bitmap_request(block->idstr);
3578 ramblock_count++;
3579 }
3580
3581 trace_ram_dirty_bitmap_sync_wait();
3582
3583 /* Wait until all the ramblocks' dirty bitmaps have been synced */
3584 while (ramblock_count--) {
3585 qemu_sem_wait(&s->rp_state.rp_sem);
3586 }
3587
3588 trace_ram_dirty_bitmap_sync_complete();
3589
3590 return 0;
3591}
3592
3593static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3594{
3595 qemu_sem_post(&s->rp_state.rp_sem);
3596}
3597
a335debb
PX
3598/*
3599 * Read the received bitmap and invert it to form the initial dirty bitmap.
3600 * This is only used when a paused postcopy migration wants to resume
3601 * from a middle point.
3602 */
3603int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3604{
3605 int ret = -EINVAL;
3606 QEMUFile *file = s->rp_state.from_dst_file;
3607 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 3608 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
3609 uint64_t size, end_mark;
3610
3611 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3612
3613 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3614 error_report("%s: incorrect state %s", __func__,
3615 MigrationStatus_str(s->state));
3616 return -EINVAL;
3617 }
3618
3619 /*
3620 * Note: see comments in ramblock_recv_bitmap_send() on why we
3621 * need the endianness conversion and the padding.
3622 */
3623 local_size = ROUND_UP(local_size, 8);
3624
3625 /* Add padding */
3626 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3627
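    /*
     * Wire format of the reply, as read below: a be64 size in bytes, the
     * little-endian bitmap itself, and a be64 end mark that must equal
     * RAMBLOCK_RECV_BITMAP_ENDING.
     */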
3628 size = qemu_get_be64(file);
3629
3630 /* The size of the bitmap should match our ramblock */
3631 if (size != local_size) {
3632 error_report("%s: ramblock '%s' bitmap size mismatch "
3633 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3634 block->idstr, size, local_size);
3635 ret = -EINVAL;
3636 goto out;
3637 }
3638
3639 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3640 end_mark = qemu_get_be64(file);
3641
3642 ret = qemu_file_get_error(file);
3643 if (ret || size != local_size) {
3644 error_report("%s: read bitmap failed for ramblock '%s': %d"
3645 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3646 __func__, block->idstr, ret, local_size, size);
3647 ret = -EIO;
3648 goto out;
3649 }
3650
3651 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3652 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3653 __func__, block->idstr, end_mark);
3654 ret = -EINVAL;
3655 goto out;
3656 }
3657
3658 /*
3659 * Endianness conversion. We are in postcopy (though paused), so
3660 * the dirty bitmap won't change and we can modify it directly.
3661 */
3662 bitmap_from_le(block->bmap, le_bitmap, nbits);
3663
3664 /*
3665 * What we received is the "received bitmap". Invert it to get the
3666 * initial dirty bitmap for this ramblock.
3667 */
3668 bitmap_complement(block->bmap, block->bmap, nbits);
3669
3670 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3671
edd090c7
PX
3672 /*
3673 * We have successfully synced the bitmap for the current ramblock.
3674 * If this is the last one to sync, we need to notify the main send thread.
3675 */
3676 ram_dirty_bitmap_reload_notify(s);
3677
a335debb
PX
3678 ret = 0;
3679out:
bf269906 3680 g_free(le_bitmap);
a335debb
PX
3681 return ret;
3682}
3683
edd090c7
PX
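/*
 * Called on the source when a paused postcopy migration is being resumed:
 * first re-sync the received bitmaps from the destination, then (roughly)
 * prepare RAMState so that only the still-dirty pages are resent.
 */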
3684static int ram_resume_prepare(MigrationState *s, void *opaque)
3685{
3686 RAMState *rs = *(RAMState **)opaque;
08614f34 3687 int ret;
edd090c7 3688
08614f34
PX
3689 ret = ram_dirty_bitmap_sync_all(s, rs);
3690 if (ret) {
3691 return ret;
3692 }
3693
3694 ram_state_resume_prepare(rs, s->to_dst_file);
3695
3696 return 0;
edd090c7
PX
3697}
3698
56e93d26 3699static SaveVMHandlers savevm_ram_handlers = {
9907e842 3700 .save_setup = ram_save_setup,
56e93d26 3701 .save_live_iterate = ram_save_iterate,
763c906b 3702 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 3703 .save_live_complete_precopy = ram_save_complete,
c6467627 3704 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
3705 .save_live_pending = ram_save_pending,
3706 .load_state = ram_load,
f265e0e4
JQ
3707 .save_cleanup = ram_save_cleanup,
3708 .load_setup = ram_load_setup,
3709 .load_cleanup = ram_load_cleanup,
edd090c7 3710 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
3711};
3712
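/*
 * Register the "ram" live-migration section: instance 0, version 4 (the
 * version checked in ram_load() above), with &ram_state as the opaque
 * pointer handed back to the handlers.
 */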
3713void ram_mig_init(void)
3714{
3715 qemu_mutex_init(&XBZRLE.lock);
ce62df53 3716 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 3717}