/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "qemu/pmem.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "socket.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "qemu/uuid.h"
#include "savevm.h"
#include "qemu/iov.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

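/*
 * All of these flags travel in the low bits of the page offset sent on the
 * wire: RAM pages are TARGET_PAGE_SIZE aligned, so those bits are otherwise
 * zero.  The send side simply ORs a flag into the offset (for example
 * "offset | RAM_SAVE_FLAG_XBZRLE" in save_xbzrle_page()), and the load side
 * can recover the flags again by masking with ~TARGET_PAGE_MASK.
 */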
80static inline bool is_zero_range(uint8_t *p, uint64_t size)
81{
a1febc49 82 return buffer_is_zero(p, size);
56e93d26
JQ
83}
84
9360447d
JQ
85XBZRLECacheStats xbzrle_counters;
86
56e93d26
JQ
/* struct containing the XBZRLE cache and a static page
   used by the compression */
89static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
c00e0928
JQ
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
f265e0e4
JQ
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
56e93d26
JQ
101} XBZRLE;
102
56e93d26
JQ
103static void XBZRLE_cache_lock(void)
104{
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107}
108
109static void XBZRLE_cache_unlock(void)
110{
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113}
114
3d0684b2
JQ
115/**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set to the reason for failure on error
 */
c9dede2d 128int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
129{
130 PageCache *new_cache;
c9dede2d 131 int64_t ret = 0;
56e93d26 132
8acabf69
JQ
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
2a313e5c
JQ
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
c9dede2d 142 return 0;
2a313e5c
JQ
143 }
144
56e93d26
JQ
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
80f8dfde 148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 149 if (!new_cache) {
56e93d26
JQ
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
56e93d26
JQ
157out:
158 XBZRLE_cache_unlock();
159 return ret;
160}
161
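/*
 * Illustrative usage sketch: how a caller such as the QMP cache-size handler
 * might invoke xbzrle_cache_resize().  The helper name and the way new_size
 * is obtained are hypothetical; only xbzrle_cache_resize() comes from this
 * file.
 */
static void example_resize_xbzrle_cache(int64_t new_size)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(new_size, &err) < 0) {
        /* error_report_err() prints and then frees the error */
        error_report_err(err);
    }
}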
fbd162e6
YK
162static bool ramblock_is_ignored(RAMBlock *block)
163{
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166}
167
b895de50 168/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
169#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
b895de50 173#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 174 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
175 if (!qemu_ram_is_migratable(block)) {} else
176
343f632c
DDAG
177#undef RAMBLOCK_FOREACH
178
fbd162e6
YK
179int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180{
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193}
194
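/*
 * Illustrative sketch of using the iterator above: summing the used size of
 * every RAMBlock that will actually be migrated.  Both helper names are
 * hypothetical; only foreach_not_ignored_block() comes from this file.
 */
static int example_add_block_size(RAMBlock *rb, void *opaque)
{
    uint64_t *total = opaque;

    *total += rb->used_length;
    return 0;                   /* returning non-zero stops the walk */
}

static uint64_t example_total_migratable_ram(void)
{
    uint64_t total = 0;

    foreach_not_ignored_block(example_add_block_size, &total);
    return total;
}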
f9494614
AP
195static void ramblock_recv_map_init(void)
196{
197 RAMBlock *rb;
198
fbd162e6 199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203}
204
205int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206{
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209}
210
1cba9f6e
DDAG
211bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212{
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214}
215
f9494614
AP
216void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217{
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219}
220
221void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223{
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227}
228
a335debb
PX
229#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231/*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
234 * Returns >0 if success with sent bytes, or <0 if error.
235 */
236int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238{
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment).  So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap.  This is
     * required in case the source and destination VMs are not using
     * the same endianness.  (Note: big endian won't work.)
     */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
a725ef9f 265 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
266
267 /*
268 * size is always aligned to 8 bytes for 64bit machines, but it
269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark the end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
bf269906 284 g_free(le_bitmap);
a335debb
PX
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291}
292
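/*
 * Hedged sketch of a matching reader for the format produced above
 * (size word, bitmap payload, ending marker).  The real consumer lives
 * elsewhere in the migration code; this simplified, hypothetical helper
 * only illustrates the wire layout.
 */
static int example_recv_bitmap_read(QEMUFile *file)
{
    uint64_t size = qemu_get_be64(file);
    uint8_t *le_bitmap = g_malloc0(size);
    int ret = 0;

    qemu_get_buffer(file, le_bitmap, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        ret = -EINVAL;
    }
    /* on success the bits would be converted from little endian and applied */
    g_free(le_bitmap);
    return ret ? ret : qemu_file_get_error(file);
}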
ec481c6c
JQ
293/*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303};
304
6f37bb8b
JQ
305/* State of RAM for migration */
306struct RAMState {
204b88b8
JQ
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
6f37bb8b
JQ
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
269ace29
JQ
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
6f37bb8b
JQ
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
6eeb63f7
WW
319 /* The free page optimization is enabled */
320 bool fpo_enabled;
8d820d6f
JQ
321 /* How many times we have dirty too many pages */
322 int dirty_rate_high_cnt;
f664da80
JQ
323 /* these variables are used for bitmap sync */
324 /* last time we did a full bitmap_sync */
325 int64_t time_last_bitmap_sync;
eac74159 326 /* bytes transferred at start_time */
c4bdf0cf 327 uint64_t bytes_xfer_prev;
a66cd90c 328 /* number of dirty pages since start_time */
68908ed6 329 uint64_t num_dirty_pages_period;
b5833fde
JQ
330 /* xbzrle misses since the beginning of the period */
331 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
332
333 /* compression statistics since the beginning of the period */
334 /* amount of count that no free thread to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* amount bytes after compression */
337 uint64_t compressed_size_prev;
338 /* amount of compressed pages */
339 uint64_t compress_pages_prev;
340
be8b02ed
XG
341 /* total handled target pages at the beginning of period */
342 uint64_t target_page_count_prev;
343 /* total handled target pages since start */
344 uint64_t target_page_count;
9360447d 345 /* number of dirty bits in the bitmap */
2dfaf12e 346 uint64_t migration_dirty_pages;
386a907b 347 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 348 QemuMutex bitmap_mutex;
68a098f3
JQ
349 /* The RAMBlock used in the last src_page_requests */
350 RAMBlock *last_req_rb;
ec481c6c
JQ
351 /* Queue of outstanding page requests from the destination */
352 QemuMutex src_page_req_mutex;
b58deb34 353 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
354};
355typedef struct RAMState RAMState;
356
53518d94 357static RAMState *ram_state;
6f37bb8b 358
bd227060
WW
359static NotifierWithReturnList precopy_notifier_list;
360
361void precopy_infrastructure_init(void)
362{
363 notifier_with_return_list_init(&precopy_notifier_list);
364}
365
366void precopy_add_notifier(NotifierWithReturn *n)
367{
368 notifier_with_return_list_add(&precopy_notifier_list, n);
369}
370
371void precopy_remove_notifier(NotifierWithReturn *n)
372{
373 notifier_with_return_remove(n);
374}
375
376int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377{
378 PrecopyNotifyData pnd;
379 pnd.reason = reason;
380 pnd.errp = errp;
381
382 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383}
384
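/*
 * Illustrative sketch of how a consumer (for instance a free-page hinting
 * device) might hook into the notifier chain above.  The callback, the
 * notifier variable and the registration helper are hypothetical; the
 * PRECOPY_NOTIFY_* reasons come from migration/misc.h.
 */
static int example_precopy_notify(NotifierWithReturn *n, void *opaque)
{
    PrecopyNotifyData *pnd = opaque;

    if (pnd->reason == PRECOPY_NOTIFY_SETUP) {
        /* start whatever optimization this consumer provides */
    }
    return 0;           /* a non-zero return is treated as an error */
}

static NotifierWithReturn example_precopy_notifier = {
    .notify = example_precopy_notify,
};

static void example_register_precopy_notifier(void)
{
    precopy_add_notifier(&example_precopy_notifier);
}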
6eeb63f7
WW
385void precopy_enable_free_page_optimization(void)
386{
387 if (!ram_state) {
388 return;
389 }
390
391 ram_state->fpo_enabled = true;
392}
393
9edabd4d 394uint64_t ram_bytes_remaining(void)
2f4fde93 395{
bae416e5
DDAG
396 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397 0;
2f4fde93
JQ
398}
399
9360447d 400MigrationStats ram_counters;
96506894 401
b8fb8cb7
DDAG
402/* used by the search for pages to send */
403struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
a935e30f
JQ
406 /* Current page to search from */
407 unsigned long page;
b8fb8cb7
DDAG
408 /* Set once we wrap around */
409 bool complete_round;
410};
411typedef struct PageSearchStatus PageSearchStatus;
412
76e03000
XG
413CompressionStats compression_counters;
414
56e93d26 415struct CompressParam {
56e93d26 416 bool done;
90e56fb4 417 bool quit;
5e5fdcff 418 bool zero_page;
56e93d26
JQ
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
34ab9e97
XG
424
425 /* internally used fields */
dcaf446e 426 z_stream stream;
34ab9e97 427 uint8_t *originbuf;
56e93d26
JQ
428};
429typedef struct CompressParam CompressParam;
430
431struct DecompressParam {
73a8912b 432 bool done;
90e56fb4 433 bool quit;
56e93d26
JQ
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
d341d9f3 437 uint8_t *compbuf;
56e93d26 438 int len;
797ca154 439 z_stream stream;
56e93d26
JQ
440};
441typedef struct DecompressParam DecompressParam;
442
443static CompressParam *comp_param;
444static QemuThread *compress_threads;
445/* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
447 * comp_done_lock is used to co-work with comp_done_cond.
448 */
0d9f9a5c
LL
449static QemuMutex comp_done_lock;
450static QemuCond comp_done_cond;
56e93d26
JQ
451/* The empty QEMUFileOps will be used by file in CompressParam */
452static const QEMUFileOps empty_ops = { };
453
34ab9e97 454static QEMUFile *decomp_file;
56e93d26
JQ
455static DecompressParam *decomp_param;
456static QemuThread *decompress_threads;
73a8912b
LL
457static QemuMutex decomp_done_lock;
458static QemuCond decomp_done_cond;
56e93d26 459
5e5fdcff 460static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 461 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
462
463static void *do_data_compress(void *opaque)
464{
465 CompressParam *param = opaque;
a7a9a88f
LL
466 RAMBlock *block;
467 ram_addr_t offset;
5e5fdcff 468 bool zero_page;
56e93d26 469
a7a9a88f 470 qemu_mutex_lock(&param->mutex);
90e56fb4 471 while (!param->quit) {
a7a9a88f
LL
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
5e5fdcff
XG
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
a7a9a88f 480
0d9f9a5c 481 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 482 param->done = true;
5e5fdcff 483 param->zero_page = zero_page;
0d9f9a5c
LL
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
56e93d26
JQ
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
56e93d26 491 }
a7a9a88f 492 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
493
494 return NULL;
495}
496
f0afa331 497static void compress_threads_save_cleanup(void)
56e93d26
JQ
498{
499 int i, thread_count;
500
05306935 501 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
502 return;
503 }
05306935 504
56e93d26
JQ
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
        /*
         * we use it as an indicator of whether the thread has been
         * properly initialized or not
         */
511 if (!comp_param[i].file) {
512 break;
513 }
05306935
FL
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
56e93d26 520 qemu_thread_join(compress_threads + i);
56e93d26
JQ
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 523 deflateEnd(&comp_param[i].stream);
34ab9e97 524 g_free(comp_param[i].originbuf);
dcaf446e
XG
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
56e93d26 527 }
0d9f9a5c
LL
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
530 g_free(compress_threads);
531 g_free(comp_param);
56e93d26
JQ
532 compress_threads = NULL;
533 comp_param = NULL;
56e93d26
JQ
534}
535
dcaf446e 536static int compress_threads_save_setup(void)
56e93d26
JQ
537{
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
dcaf446e 541 return 0;
56e93d26 542 }
56e93d26
JQ
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
56e93d26 548 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
dcaf446e
XG
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
34ab9e97 556 g_free(comp_param[i].originbuf);
dcaf446e
XG
557 goto exit;
558 }
559
e110aa91
C
        /* comp_param[i].file is just used as a dummy buffer to save data;
         * set its ops to the empty ones.
         */
563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564 comp_param[i].done = true;
90e56fb4 565 comp_param[i].quit = false;
56e93d26
JQ
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
dcaf446e
XG
572 return 0;
573
574exit:
575 compress_threads_save_cleanup();
576 return -1;
56e93d26
JQ
577}
578
f986c3d2
JQ
579/* Multiple fd's */
580
af8b7d2b
JQ
581#define MULTIFD_MAGIC 0x11223344U
582#define MULTIFD_VERSION 1
583
6df264ac
JQ
584#define MULTIFD_FLAG_SYNC (1 << 0)
585
efd1a1d6 586/* This value needs to be a multiple of qemu_target_page_size() */
4b0c7264 587#define MULTIFD_PACKET_SIZE (512 * 1024)
efd1a1d6 588
af8b7d2b
JQ
589typedef struct {
590 uint32_t magic;
591 uint32_t version;
592 unsigned char uuid[16]; /* QemuUUID */
593 uint8_t id;
5fbd8b4b
JQ
594 uint8_t unused1[7]; /* Reserved for future use */
595 uint64_t unused2[4]; /* Reserved for future use */
af8b7d2b
JQ
596} __attribute__((packed)) MultiFDInit_t;
597
2a26c979
JQ
598typedef struct {
599 uint32_t magic;
600 uint32_t version;
601 uint32_t flags;
6f862692
JQ
602 /* maximum number of allocated pages */
603 uint32_t pages_alloc;
604 uint32_t pages_used;
2a34ee59
JQ
605 /* size of the next packet that contains pages */
606 uint32_t next_packet_size;
2a26c979 607 uint64_t packet_num;
5fbd8b4b 608 uint64_t unused[4]; /* Reserved for future use */
2a26c979
JQ
609 char ramblock[256];
610 uint64_t offset[];
611} __attribute__((packed)) MultiFDPacket_t;
612
34c55a94
JQ
613typedef struct {
614 /* number of used pages */
615 uint32_t used;
616 /* number of allocated pages */
617 uint32_t allocated;
618 /* global number of generated multifd packets */
619 uint64_t packet_num;
620 /* offset of each page */
621 ram_addr_t *offset;
622 /* pointer to each page */
623 struct iovec *iov;
624 RAMBlock *block;
625} MultiFDPages_t;
626
8c4598f2
JQ
627typedef struct {
    /* these fields are not changed once the thread is created */
629 /* channel number */
f986c3d2 630 uint8_t id;
8c4598f2 631 /* channel thread name */
f986c3d2 632 char *name;
8c4598f2 633 /* channel thread id */
f986c3d2 634 QemuThread thread;
8c4598f2 635 /* communication channel */
60df2d4a 636 QIOChannel *c;
8c4598f2 637 /* sem where to wait for more work */
f986c3d2 638 QemuSemaphore sem;
8c4598f2 639 /* this mutex protects the following parameters */
f986c3d2 640 QemuMutex mutex;
8c4598f2 641 /* is this channel thread running */
66770707 642 bool running;
8c4598f2 643 /* should this thread finish */
f986c3d2 644 bool quit;
0beb5ed3
JQ
645 /* thread has work to do */
646 int pending_job;
34c55a94
JQ
    /* array of pages to send */
648 MultiFDPages_t *pages;
2a26c979
JQ
649 /* packet allocated len */
650 uint32_t packet_len;
651 /* pointer to the packet */
652 MultiFDPacket_t *packet;
653 /* multifd flags for each packet */
654 uint32_t flags;
2a34ee59
JQ
655 /* size of the next packet that contains pages */
656 uint32_t next_packet_size;
2a26c979
JQ
657 /* global number of generated multifd packets */
658 uint64_t packet_num;
408ea6ae
JQ
659 /* thread local variables */
660 /* packets sent through this channel */
661 uint64_t num_packets;
662 /* pages sent through this channel */
663 uint64_t num_pages;
8c4598f2
JQ
664} MultiFDSendParams;
665
666typedef struct {
    /* these fields are not changed once the thread is created */
668 /* channel number */
669 uint8_t id;
670 /* channel thread name */
671 char *name;
672 /* channel thread id */
673 QemuThread thread;
674 /* communication channel */
675 QIOChannel *c;
8c4598f2
JQ
676 /* this mutex protects the following parameters */
677 QemuMutex mutex;
678 /* is this channel thread running */
679 bool running;
3c3ca25d
JQ
680 /* should this thread finish */
681 bool quit;
34c55a94
JQ
682 /* array of pages to receive */
683 MultiFDPages_t *pages;
2a26c979
JQ
684 /* packet allocated len */
685 uint32_t packet_len;
686 /* pointer to the packet */
687 MultiFDPacket_t *packet;
688 /* multifd flags for each packet */
689 uint32_t flags;
690 /* global number of generated multifd packets */
691 uint64_t packet_num;
408ea6ae 692 /* thread local variables */
2a34ee59
JQ
693 /* size of the next packet that contains pages */
694 uint32_t next_packet_size;
408ea6ae
JQ
695 /* packets sent through this channel */
696 uint64_t num_packets;
697 /* pages sent through this channel */
698 uint64_t num_pages;
6df264ac
JQ
699 /* syncs main thread and channels */
700 QemuSemaphore sem_sync;
8c4598f2 701} MultiFDRecvParams;
f986c3d2 702
af8b7d2b
JQ
703static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
704{
705 MultiFDInit_t msg;
706 int ret;
707
708 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
709 msg.version = cpu_to_be32(MULTIFD_VERSION);
710 msg.id = p->id;
711 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
712
713 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
714 if (ret != 0) {
715 return -1;
716 }
717 return 0;
718}
719
720static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
721{
722 MultiFDInit_t msg;
723 int ret;
724
725 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
726 if (ret != 0) {
727 return -1;
728 }
729
341ba0df
PM
730 msg.magic = be32_to_cpu(msg.magic);
731 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
732
733 if (msg.magic != MULTIFD_MAGIC) {
734 error_setg(errp, "multifd: received packet magic %x "
735 "expected %x", msg.magic, MULTIFD_MAGIC);
736 return -1;
737 }
738
739 if (msg.version != MULTIFD_VERSION) {
740 error_setg(errp, "multifd: received packet version %d "
741 "expected %d", msg.version, MULTIFD_VERSION);
742 return -1;
743 }
744
745 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
746 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
747 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
748
749 error_setg(errp, "multifd: received uuid '%s' and expected "
750 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
751 g_free(uuid);
752 g_free(msg_uuid);
753 return -1;
754 }
755
    if (msg.id > migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel id %d "
                   "exceeds the number of channels %d",
                   msg.id, migrate_multifd_channels());
        return -1;
    }
761
762 return msg.id;
763}
764
34c55a94
JQ
765static MultiFDPages_t *multifd_pages_init(size_t size)
766{
767 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
768
769 pages->allocated = size;
770 pages->iov = g_new0(struct iovec, size);
771 pages->offset = g_new0(ram_addr_t, size);
772
773 return pages;
774}
775
776static void multifd_pages_clear(MultiFDPages_t *pages)
777{
778 pages->used = 0;
779 pages->allocated = 0;
780 pages->packet_num = 0;
781 pages->block = NULL;
782 g_free(pages->iov);
783 pages->iov = NULL;
784 g_free(pages->offset);
785 pages->offset = NULL;
786 g_free(pages);
787}
788
2a26c979
JQ
789static void multifd_send_fill_packet(MultiFDSendParams *p)
790{
791 MultiFDPacket_t *packet = p->packet;
7ed379b2 792 uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
2a26c979
JQ
793 int i;
794
795 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
796 packet->version = cpu_to_be32(MULTIFD_VERSION);
797 packet->flags = cpu_to_be32(p->flags);
7ed379b2 798 packet->pages_alloc = cpu_to_be32(page_max);
6f862692 799 packet->pages_used = cpu_to_be32(p->pages->used);
2a34ee59 800 packet->next_packet_size = cpu_to_be32(p->next_packet_size);
2a26c979
JQ
801 packet->packet_num = cpu_to_be64(p->packet_num);
802
803 if (p->pages->block) {
804 strncpy(packet->ramblock, p->pages->block->idstr, 256);
805 }
806
807 for (i = 0; i < p->pages->used; i++) {
808 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
809 }
810}
811
812static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
813{
814 MultiFDPacket_t *packet = p->packet;
7ed379b2 815 uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
2a26c979
JQ
816 RAMBlock *block;
817 int i;
818
341ba0df 819 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
820 if (packet->magic != MULTIFD_MAGIC) {
821 error_setg(errp, "multifd: received packet "
822 "magic %x and expected magic %x",
823 packet->magic, MULTIFD_MAGIC);
824 return -1;
825 }
826
341ba0df 827 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
828 if (packet->version != MULTIFD_VERSION) {
829 error_setg(errp, "multifd: received packet "
830 "version %d and expected version %d",
831 packet->version, MULTIFD_VERSION);
832 return -1;
833 }
834
835 p->flags = be32_to_cpu(packet->flags);
836
6f862692 837 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
7ed379b2
JQ
    /*
     * If we received a packet that is 100 times bigger than expected,
     * just stop the migration.  The factor of 100 is an arbitrary
     * safety limit.
     */
842 if (packet->pages_alloc > pages_max * 100) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected a maximum size of %d",
                   packet->pages_alloc, pages_max * 100);
2a26c979
JQ
846 return -1;
847 }
7ed379b2
JQ
848 /*
849 * We received a packet that is bigger than expected but inside
850 * reasonable limits (see previous comment). Just reallocate.
851 */
852 if (packet->pages_alloc > p->pages->allocated) {
853 multifd_pages_clear(p->pages);
f151f8ac 854 p->pages = multifd_pages_init(packet->pages_alloc);
7ed379b2 855 }
2a26c979 856
6f862692
JQ
857 p->pages->used = be32_to_cpu(packet->pages_used);
858 if (p->pages->used > packet->pages_alloc) {
        error_setg(errp, "multifd: received packet "
                   "with %d pages, but the maximum is %d",
                   p->pages->used, packet->pages_alloc);
2a26c979
JQ
862 return -1;
863 }
864
2a34ee59 865 p->next_packet_size = be32_to_cpu(packet->next_packet_size);
2a26c979
JQ
866 p->packet_num = be64_to_cpu(packet->packet_num);
867
868 if (p->pages->used) {
869 /* make sure that ramblock is 0 terminated */
870 packet->ramblock[255] = 0;
871 block = qemu_ram_block_by_name(packet->ramblock);
872 if (!block) {
873 error_setg(errp, "multifd: unknown ram block %s",
874 packet->ramblock);
875 return -1;
876 }
877 }
878
879 for (i = 0; i < p->pages->used; i++) {
880 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
881
882 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
                       " (max " RAM_ADDR_FMT ")",
                       offset, block->used_length - TARGET_PAGE_SIZE);
886 return -1;
887 }
888 p->pages->iov[i].iov_base = block->host + offset;
889 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
890 }
891
892 return 0;
893}
894
f986c3d2
JQ
895struct {
896 MultiFDSendParams *params;
34c55a94
JQ
    /* array of pages to send */
898 MultiFDPages_t *pages;
6df264ac
JQ
899 /* syncs main thread and channels */
900 QemuSemaphore sem_sync;
901 /* global number of generated multifd packets */
902 uint64_t packet_num;
b9ee2f7d
JQ
903 /* send channels ready */
904 QemuSemaphore channels_ready;
f986c3d2
JQ
905} *multifd_send_state;
906
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a MultiFDPages_t for each channel, plus a main one.  Each
 * time we need to send a batch of pages we exchange the main one with
 * the one of the channel that is going to send it.  There are two
 * reasons for that:
 * - to avoid doing lots of mallocs during migration
 * - to make it easier to know what to free at the end of migration
 *
 * This way we always know who owns each "pages" struct and we don't
 * need any locking.  It belongs either to the migration thread or to
 * the channel thread.  Switching is safe because the migration thread
 * holds the channel mutex when changing it, and the channel must have
 * finished with its own copy, otherwise pending_job couldn't be false.
 */
924
1b81c974 925static int multifd_send_pages(RAMState *rs)
b9ee2f7d
JQ
926{
927 int i;
928 static int next_channel;
929 MultiFDSendParams *p = NULL; /* make happy gcc */
930 MultiFDPages_t *pages = multifd_send_state->pages;
931 uint64_t transferred;
932
933 qemu_sem_wait(&multifd_send_state->channels_ready);
934 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
935 p = &multifd_send_state->params[i];
936
937 qemu_mutex_lock(&p->mutex);
713f762a
IR
938 if (p->quit) {
939 error_report("%s: channel %d has already quit!", __func__, i);
940 qemu_mutex_unlock(&p->mutex);
941 return -1;
942 }
b9ee2f7d
JQ
943 if (!p->pending_job) {
944 p->pending_job++;
945 next_channel = (i + 1) % migrate_multifd_channels();
946 break;
947 }
948 qemu_mutex_unlock(&p->mutex);
949 }
950 p->pages->used = 0;
951
952 p->packet_num = multifd_send_state->packet_num++;
953 p->pages->block = NULL;
954 multifd_send_state->pages = p->pages;
955 p->pages = pages;
4fcefd44 956 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
1b81c974 957 qemu_file_update_transfer(rs->f, transferred);
b9ee2f7d
JQ
958 ram_counters.multifd_bytes += transferred;
    ram_counters.transferred += transferred;
960 qemu_mutex_unlock(&p->mutex);
961 qemu_sem_post(&p->sem);
713f762a
IR
962
963 return 1;
b9ee2f7d
JQ
964}
965
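/*
 * Minimal sketch of the ownership exchange described in the comment above
 * multifd_send_pages(), with locking and accounting stripped out.  The
 * helper name is made up; it only shows the pointer swap.
 */
static void example_swap_pages(MultiFDSendParams *p)
{
    MultiFDPages_t *filled = multifd_send_state->pages;

    /* the migration thread keeps the channel's empty array for the next batch */
    multifd_send_state->pages = p->pages;
    /* the channel thread now owns the batch that was just filled */
    p->pages = filled;
}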
1b81c974 966static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
b9ee2f7d
JQ
967{
968 MultiFDPages_t *pages = multifd_send_state->pages;
969
970 if (!pages->block) {
971 pages->block = block;
972 }
973
974 if (pages->block == block) {
975 pages->offset[pages->used] = offset;
976 pages->iov[pages->used].iov_base = block->host + offset;
977 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
978 pages->used++;
979
980 if (pages->used < pages->allocated) {
713f762a 981 return 1;
b9ee2f7d
JQ
982 }
983 }
984
1b81c974 985 if (multifd_send_pages(rs) < 0) {
713f762a
IR
986 return -1;
987 }
b9ee2f7d
JQ
988
989 if (pages->block != block) {
1b81c974 990 return multifd_queue_page(rs, block, offset);
b9ee2f7d 991 }
713f762a
IR
992
993 return 1;
b9ee2f7d
JQ
994}
995
66770707 996static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
997{
998 int i;
999
5558c91a
JQ
1000 trace_multifd_send_terminate_threads(err != NULL);
1001
7a169d74
JQ
1002 if (err) {
1003 MigrationState *s = migrate_get_current();
1004 migrate_set_error(s, err);
1005 if (s->state == MIGRATION_STATUS_SETUP ||
1006 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1007 s->state == MIGRATION_STATUS_DEVICE ||
1008 s->state == MIGRATION_STATUS_ACTIVE) {
1009 migrate_set_state(&s->state, s->state,
1010 MIGRATION_STATUS_FAILED);
1011 }
1012 }
1013
66770707 1014 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1015 MultiFDSendParams *p = &multifd_send_state->params[i];
1016
1017 qemu_mutex_lock(&p->mutex);
1018 p->quit = true;
1019 qemu_sem_post(&p->sem);
1020 qemu_mutex_unlock(&p->mutex);
1021 }
1022}
1023
1398b2e3 1024void multifd_save_cleanup(void)
f986c3d2
JQ
1025{
1026 int i;
f986c3d2
JQ
1027
1028 if (!migrate_use_multifd()) {
1398b2e3 1029 return;
f986c3d2 1030 }
66770707
JQ
1031 multifd_send_terminate_threads(NULL);
1032 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1033 MultiFDSendParams *p = &multifd_send_state->params[i];
1034
66770707
JQ
1035 if (p->running) {
1036 qemu_thread_join(&p->thread);
1037 }
60df2d4a
JQ
1038 socket_send_channel_destroy(p->c);
1039 p->c = NULL;
f986c3d2
JQ
1040 qemu_mutex_destroy(&p->mutex);
1041 qemu_sem_destroy(&p->sem);
1042 g_free(p->name);
1043 p->name = NULL;
34c55a94
JQ
1044 multifd_pages_clear(p->pages);
1045 p->pages = NULL;
2a26c979
JQ
1046 p->packet_len = 0;
1047 g_free(p->packet);
1048 p->packet = NULL;
f986c3d2 1049 }
b9ee2f7d 1050 qemu_sem_destroy(&multifd_send_state->channels_ready);
6df264ac 1051 qemu_sem_destroy(&multifd_send_state->sem_sync);
f986c3d2
JQ
1052 g_free(multifd_send_state->params);
1053 multifd_send_state->params = NULL;
34c55a94
JQ
1054 multifd_pages_clear(multifd_send_state->pages);
1055 multifd_send_state->pages = NULL;
f986c3d2
JQ
1056 g_free(multifd_send_state);
1057 multifd_send_state = NULL;
f986c3d2
JQ
1058}
1059
1b81c974 1060static void multifd_send_sync_main(RAMState *rs)
6df264ac
JQ
1061{
1062 int i;
1063
1064 if (!migrate_use_multifd()) {
1065 return;
1066 }
b9ee2f7d 1067 if (multifd_send_state->pages->used) {
1b81c974 1068 if (multifd_send_pages(rs) < 0) {
713f762a
IR
1069 error_report("%s: multifd_send_pages fail", __func__);
1070 return;
1071 }
b9ee2f7d 1072 }
6df264ac
JQ
1073 for (i = 0; i < migrate_multifd_channels(); i++) {
1074 MultiFDSendParams *p = &multifd_send_state->params[i];
1075
1076 trace_multifd_send_sync_main_signal(p->id);
1077
1078 qemu_mutex_lock(&p->mutex);
b9ee2f7d 1079
713f762a
IR
1080 if (p->quit) {
1081 error_report("%s: channel %d has already quit", __func__, i);
1082 qemu_mutex_unlock(&p->mutex);
1083 return;
1084 }
1085
b9ee2f7d 1086 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
1087 p->flags |= MULTIFD_FLAG_SYNC;
1088 p->pending_job++;
1b81c974 1089 qemu_file_update_transfer(rs->f, p->packet_len);
81507f6b
IR
1090 ram_counters.multifd_bytes += p->packet_len;
1091 ram_counters.transferred += p->packet_len;
6df264ac
JQ
1092 qemu_mutex_unlock(&p->mutex);
1093 qemu_sem_post(&p->sem);
1094 }
1095 for (i = 0; i < migrate_multifd_channels(); i++) {
1096 MultiFDSendParams *p = &multifd_send_state->params[i];
1097
1098 trace_multifd_send_sync_main_wait(p->id);
1099 qemu_sem_wait(&multifd_send_state->sem_sync);
1100 }
1101 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1102}
1103
f986c3d2
JQ
1104static void *multifd_send_thread(void *opaque)
1105{
1106 MultiFDSendParams *p = opaque;
af8b7d2b 1107 Error *local_err = NULL;
a3ec6b7d
IR
1108 int ret = 0;
1109 uint32_t flags = 0;
af8b7d2b 1110
408ea6ae 1111 trace_multifd_send_thread_start(p->id);
74637e6f 1112 rcu_register_thread();
408ea6ae 1113
af8b7d2b
JQ
1114 if (multifd_send_initial_packet(p, &local_err) < 0) {
1115 goto out;
1116 }
408ea6ae
JQ
1117 /* initial packet */
1118 p->num_packets = 1;
f986c3d2
JQ
1119
1120 while (true) {
d82628e4 1121 qemu_sem_wait(&p->sem);
f986c3d2 1122 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1123
1124 if (p->pending_job) {
1125 uint32_t used = p->pages->used;
1126 uint64_t packet_num = p->packet_num;
a3ec6b7d 1127 flags = p->flags;
0beb5ed3 1128
2a34ee59 1129 p->next_packet_size = used * qemu_target_page_size();
0beb5ed3
JQ
1130 multifd_send_fill_packet(p);
1131 p->flags = 0;
1132 p->num_packets++;
1133 p->num_pages += used;
1134 p->pages->used = 0;
1135 qemu_mutex_unlock(&p->mutex);
1136
2a34ee59
JQ
1137 trace_multifd_send(p->id, packet_num, used, flags,
1138 p->next_packet_size);
0beb5ed3 1139
8b2db7f5
JQ
1140 ret = qio_channel_write_all(p->c, (void *)p->packet,
1141 p->packet_len, &local_err);
1142 if (ret != 0) {
1143 break;
1144 }
1145
ad24c7cb
JQ
1146 if (used) {
1147 ret = qio_channel_writev_all(p->c, p->pages->iov,
1148 used, &local_err);
1149 if (ret != 0) {
1150 break;
1151 }
8b2db7f5 1152 }
0beb5ed3
JQ
1153
1154 qemu_mutex_lock(&p->mutex);
1155 p->pending_job--;
1156 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1157
1158 if (flags & MULTIFD_FLAG_SYNC) {
1159 qemu_sem_post(&multifd_send_state->sem_sync);
1160 }
b9ee2f7d 1161 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1162 } else if (p->quit) {
f986c3d2
JQ
1163 qemu_mutex_unlock(&p->mutex);
1164 break;
6df264ac
JQ
1165 } else {
1166 qemu_mutex_unlock(&p->mutex);
1167 /* sometimes there are spurious wakeups */
f986c3d2 1168 }
f986c3d2
JQ
1169 }
1170
af8b7d2b
JQ
1171out:
1172 if (local_err) {
1173 multifd_send_terminate_threads(local_err);
1174 }
1175
a3ec6b7d
IR
    /*
     * An error happened and this thread is about to exit, but it can't
     * just leave: wake up whoever might be waiting on it first.
     */
1180 if (ret != 0) {
1181 if (flags & MULTIFD_FLAG_SYNC) {
1182 qemu_sem_post(&multifd_send_state->sem_sync);
1183 }
1184 qemu_sem_post(&multifd_send_state->channels_ready);
1185 }
1186
66770707
JQ
1187 qemu_mutex_lock(&p->mutex);
1188 p->running = false;
1189 qemu_mutex_unlock(&p->mutex);
1190
74637e6f 1191 rcu_unregister_thread();
408ea6ae
JQ
1192 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1193
f986c3d2
JQ
1194 return NULL;
1195}
1196
60df2d4a
JQ
1197static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1198{
1199 MultiFDSendParams *p = opaque;
1200 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1201 Error *local_err = NULL;
1202
1203 if (qio_task_propagate_error(task, &local_err)) {
1398b2e3
FL
1204 migrate_set_error(migrate_get_current(), local_err);
1205 multifd_save_cleanup();
60df2d4a
JQ
1206 } else {
1207 p->c = QIO_CHANNEL(sioc);
1208 qio_channel_set_delay(p->c, false);
1209 p->running = true;
1210 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1211 QEMU_THREAD_JOINABLE);
60df2d4a
JQ
1212 }
1213}
1214
f986c3d2
JQ
1215int multifd_save_setup(void)
1216{
1217 int thread_count;
efd1a1d6 1218 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1219 uint8_t i;
1220
1221 if (!migrate_use_multifd()) {
1222 return 0;
1223 }
1224 thread_count = migrate_multifd_channels();
1225 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1226 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
34c55a94 1227 multifd_send_state->pages = multifd_pages_init(page_count);
6df264ac 1228 qemu_sem_init(&multifd_send_state->sem_sync, 0);
b9ee2f7d 1229 qemu_sem_init(&multifd_send_state->channels_ready, 0);
34c55a94 1230
f986c3d2
JQ
1231 for (i = 0; i < thread_count; i++) {
1232 MultiFDSendParams *p = &multifd_send_state->params[i];
1233
1234 qemu_mutex_init(&p->mutex);
1235 qemu_sem_init(&p->sem, 0);
1236 p->quit = false;
0beb5ed3 1237 p->pending_job = 0;
f986c3d2 1238 p->id = i;
34c55a94 1239 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1240 p->packet_len = sizeof(MultiFDPacket_t)
1241 + sizeof(ram_addr_t) * page_count;
1242 p->packet = g_malloc0(p->packet_len);
f986c3d2 1243 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1244 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1245 }
1246 return 0;
1247}
1248
f986c3d2
JQ
1249struct {
1250 MultiFDRecvParams *params;
1251 /* number of created threads */
1252 int count;
6df264ac
JQ
1253 /* syncs main thread and channels */
1254 QemuSemaphore sem_sync;
1255 /* global number of generated multifd packets */
1256 uint64_t packet_num;
f986c3d2
JQ
1257} *multifd_recv_state;
1258
66770707 1259static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1260{
1261 int i;
1262
5558c91a
JQ
1263 trace_multifd_recv_terminate_threads(err != NULL);
1264
7a169d74
JQ
1265 if (err) {
1266 MigrationState *s = migrate_get_current();
1267 migrate_set_error(s, err);
1268 if (s->state == MIGRATION_STATUS_SETUP ||
1269 s->state == MIGRATION_STATUS_ACTIVE) {
1270 migrate_set_state(&s->state, s->state,
1271 MIGRATION_STATUS_FAILED);
1272 }
1273 }
1274
66770707 1275 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1276 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1277
1278 qemu_mutex_lock(&p->mutex);
3c3ca25d 1279 p->quit = true;
7a5cc33c
JQ
1280 /* We could arrive here for two reasons:
1281 - normal quit, i.e. everything went fine, just finished
1282 - error quit: We close the channels so the channel threads
1283 finish the qio_channel_read_all_eof() */
1284 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
f986c3d2
JQ
1285 qemu_mutex_unlock(&p->mutex);
1286 }
1287}
1288
1289int multifd_load_cleanup(Error **errp)
1290{
1291 int i;
1292 int ret = 0;
1293
1294 if (!migrate_use_multifd()) {
1295 return 0;
1296 }
66770707
JQ
1297 multifd_recv_terminate_threads(NULL);
1298 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1299 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1300
66770707 1301 if (p->running) {
3c3ca25d 1302 p->quit = true;
f193bc0c
IR
            /*
             * multifd_recv_thread may hang in the MULTIFD_FLAG_SYNC
             * handling code; waking it up here is harmless in the
             * cleanup phase.
             */
1307 qemu_sem_post(&p->sem_sync);
66770707
JQ
1308 qemu_thread_join(&p->thread);
1309 }
60df2d4a
JQ
1310 object_unref(OBJECT(p->c));
1311 p->c = NULL;
f986c3d2 1312 qemu_mutex_destroy(&p->mutex);
6df264ac 1313 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1314 g_free(p->name);
1315 p->name = NULL;
34c55a94
JQ
1316 multifd_pages_clear(p->pages);
1317 p->pages = NULL;
2a26c979
JQ
1318 p->packet_len = 0;
1319 g_free(p->packet);
1320 p->packet = NULL;
f986c3d2 1321 }
6df264ac 1322 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1323 g_free(multifd_recv_state->params);
1324 multifd_recv_state->params = NULL;
1325 g_free(multifd_recv_state);
1326 multifd_recv_state = NULL;
1327
1328 return ret;
1329}
1330
6df264ac
JQ
1331static void multifd_recv_sync_main(void)
1332{
1333 int i;
1334
1335 if (!migrate_use_multifd()) {
1336 return;
1337 }
1338 for (i = 0; i < migrate_multifd_channels(); i++) {
1339 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1340
6df264ac
JQ
1341 trace_multifd_recv_sync_main_wait(p->id);
1342 qemu_sem_wait(&multifd_recv_state->sem_sync);
77568ea7
WY
1343 }
1344 for (i = 0; i < migrate_multifd_channels(); i++) {
1345 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1346
6df264ac
JQ
1347 qemu_mutex_lock(&p->mutex);
1348 if (multifd_recv_state->packet_num < p->packet_num) {
1349 multifd_recv_state->packet_num = p->packet_num;
1350 }
1351 qemu_mutex_unlock(&p->mutex);
6df264ac 1352 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1353 qemu_sem_post(&p->sem_sync);
1354 }
1355 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1356}
1357
f986c3d2
JQ
1358static void *multifd_recv_thread(void *opaque)
1359{
1360 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1361 Error *local_err = NULL;
1362 int ret;
f986c3d2 1363
408ea6ae 1364 trace_multifd_recv_thread_start(p->id);
74637e6f 1365 rcu_register_thread();
408ea6ae 1366
f986c3d2 1367 while (true) {
6df264ac
JQ
1368 uint32_t used;
1369 uint32_t flags;
0beb5ed3 1370
3c3ca25d
JQ
1371 if (p->quit) {
1372 break;
1373 }
1374
8b2db7f5
JQ
1375 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1376 p->packet_len, &local_err);
1377 if (ret == 0) { /* EOF */
1378 break;
1379 }
1380 if (ret == -1) { /* Error */
1381 break;
1382 }
2a26c979 1383
6df264ac
JQ
1384 qemu_mutex_lock(&p->mutex);
1385 ret = multifd_recv_unfill_packet(p, &local_err);
1386 if (ret) {
f986c3d2
JQ
1387 qemu_mutex_unlock(&p->mutex);
1388 break;
1389 }
6df264ac
JQ
1390
1391 used = p->pages->used;
1392 flags = p->flags;
2a34ee59
JQ
1393 trace_multifd_recv(p->id, p->packet_num, used, flags,
1394 p->next_packet_size);
6df264ac
JQ
1395 p->num_packets++;
1396 p->num_pages += used;
f986c3d2 1397 qemu_mutex_unlock(&p->mutex);
6df264ac 1398
ad24c7cb
JQ
1399 if (used) {
1400 ret = qio_channel_readv_all(p->c, p->pages->iov,
1401 used, &local_err);
1402 if (ret != 0) {
1403 break;
1404 }
8b2db7f5
JQ
1405 }
1406
6df264ac
JQ
1407 if (flags & MULTIFD_FLAG_SYNC) {
1408 qemu_sem_post(&multifd_recv_state->sem_sync);
1409 qemu_sem_wait(&p->sem_sync);
1410 }
f986c3d2
JQ
1411 }
1412
d82628e4
JQ
1413 if (local_err) {
1414 multifd_recv_terminate_threads(local_err);
1415 }
66770707
JQ
1416 qemu_mutex_lock(&p->mutex);
1417 p->running = false;
1418 qemu_mutex_unlock(&p->mutex);
1419
74637e6f 1420 rcu_unregister_thread();
408ea6ae
JQ
1421 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1422
f986c3d2
JQ
1423 return NULL;
1424}
1425
1426int multifd_load_setup(void)
1427{
1428 int thread_count;
efd1a1d6 1429 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1430 uint8_t i;
1431
1432 if (!migrate_use_multifd()) {
1433 return 0;
1434 }
1435 thread_count = migrate_multifd_channels();
1436 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1437 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1438 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1439 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1440
f986c3d2
JQ
1441 for (i = 0; i < thread_count; i++) {
1442 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1443
1444 qemu_mutex_init(&p->mutex);
6df264ac 1445 qemu_sem_init(&p->sem_sync, 0);
3c3ca25d 1446 p->quit = false;
f986c3d2 1447 p->id = i;
34c55a94 1448 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1449 p->packet_len = sizeof(MultiFDPacket_t)
1450 + sizeof(ram_addr_t) * page_count;
1451 p->packet = g_malloc0(p->packet_len);
f986c3d2 1452 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1453 }
1454 return 0;
1455}
1456
62c1e0ca
JQ
1457bool multifd_recv_all_channels_created(void)
1458{
1459 int thread_count = migrate_multifd_channels();
1460
1461 if (!migrate_use_multifd()) {
1462 return true;
1463 }
1464
1465 return thread_count == atomic_read(&multifd_recv_state->count);
1466}
1467
49ed0d24
FL
1468/*
1469 * Try to receive all multifd channels to get ready for the migration.
 * - Return true and do not set @errp when correctly receiving all channels;
1471 * - Return false and do not set @errp when correctly receiving the current one;
1472 * - Return false and set @errp when failing to receive the current channel.
1473 */
1474bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
71bb07db 1475{
60df2d4a 1476 MultiFDRecvParams *p;
af8b7d2b
JQ
1477 Error *local_err = NULL;
1478 int id;
60df2d4a 1479
af8b7d2b
JQ
1480 id = multifd_recv_initial_packet(ioc, &local_err);
1481 if (id < 0) {
1482 multifd_recv_terminate_threads(local_err);
49ed0d24
FL
1483 error_propagate_prepend(errp, local_err,
1484 "failed to receive packet"
1485 " via multifd channel %d: ",
1486 atomic_read(&multifd_recv_state->count));
81e62053 1487 return false;
af8b7d2b
JQ
1488 }
1489
1490 p = &multifd_recv_state->params[id];
1491 if (p->c != NULL) {
1492 error_setg(&local_err, "multifd: received id '%d' already setup'",
1493 id);
1494 multifd_recv_terminate_threads(local_err);
49ed0d24 1495 error_propagate(errp, local_err);
81e62053 1496 return false;
af8b7d2b 1497 }
60df2d4a
JQ
1498 p->c = ioc;
1499 object_ref(OBJECT(ioc));
408ea6ae
JQ
1500 /* initial packet */
1501 p->num_packets = 1;
60df2d4a
JQ
1502
1503 p->running = true;
1504 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1505 QEMU_THREAD_JOINABLE);
1506 atomic_inc(&multifd_recv_state->count);
49ed0d24
FL
1507 return atomic_read(&multifd_recv_state->count) ==
1508 migrate_multifd_channels();
71bb07db
JQ
1509}
1510
/**
 * save_page_header: write page header to wire
 *
 * If this is the first page sent for this block, it also writes the
 * block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page;
 *          in the lower bits, it contains flags
 */
2bf3aa85
JQ
1523static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1524 ram_addr_t offset)
56e93d26 1525{
9f5f380b 1526 size_t size, len;
56e93d26 1527
24795694
JQ
1528 if (block == rs->last_sent_block) {
1529 offset |= RAM_SAVE_FLAG_CONTINUE;
1530 }
2bf3aa85 1531 qemu_put_be64(f, offset);
56e93d26
JQ
1532 size = 8;
1533
1534 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1535 len = strlen(block->idstr);
2bf3aa85
JQ
1536 qemu_put_byte(f, len);
1537 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1538 size += 1 + len;
24795694 1539 rs->last_sent_block = block;
56e93d26
JQ
1540 }
1541 return size;
1542}
1543
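/*
 * Hedged sketch of how the load side takes such a header apart.  The real
 * parsing happens in ram_load(); this simplified helper is hypothetical and
 * assumes idstr points at a 256-byte buffer.
 */
static void example_parse_page_header(QEMUFile *f, ram_addr_t *addr,
                                      int *flags, char *idstr)
{
    uint64_t header = qemu_get_be64(f);

    *flags = header & ~TARGET_PAGE_MASK;    /* low bits carry the flags */
    *addr = header & TARGET_PAGE_MASK;

    if (!(*flags & RAM_SAVE_FLAG_CONTINUE)) {
        /* the block id is only sent for the first page of a new block */
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
}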
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce the amount of guest CPU execution to hopefully slow down memory
 * writes.  If the guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination, then we should be
 * able to complete the migration.  Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
1553static void mig_throttle_guest_down(void)
1554{
1555 MigrationState *s = migrate_get_current();
2594f56d
DB
1556 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1557 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
4cbc9c7f 1558 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1559
1560 /* We have not started throttling yet. Let's start it. */
1561 if (!cpu_throttle_active()) {
1562 cpu_throttle_set(pct_initial);
1563 } else {
1564 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1565 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1566 pct_max));
070afca2
JH
1567 }
1568}
1569
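/*
 * Worked example (using what are believed to be the default migration
 * parameters at the time of writing: cpu_throttle_initial = 20,
 * cpu_throttle_increment = 10, max_cpu_throttle = 99): successive calls to
 * mig_throttle_guest_down() throttle the vCPUs at 20 %, 30 %, 40 %, ...,
 * with the value capped at 99 %.
 */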
3d0684b2
JQ
1570/**
1571 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1572 *
6f37bb8b 1573 * @rs: current RAM state
3d0684b2
JQ
1574 * @current_addr: address for the zero page
1575 *
1576 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1577 * The important thing is that a stale (not-yet-0'd) page be replaced
1578 * by the new data.
1579 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1580 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1581 */
6f37bb8b 1582static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1583{
6f37bb8b 1584 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1585 return;
1586 }
1587
1588 /* We don't care if this fails to allocate a new cache page
1589 * as long as it updated an old one */
c00e0928 1590 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1591 ram_counters.dirty_sync_count);
56e93d26
JQ
1592}
1593
1594#define ENCODING_FLAG_XBZRLE 0x1
1595
1596/**
1597 * save_xbzrle_page: compress and send current page
1598 *
1599 * Returns: 1 means that we wrote the page
1600 * 0 means that page is identical to the one already sent
1601 * -1 means that xbzrle would be longer than normal
1602 *
5a987738 1603 * @rs: current RAM state
3d0684b2
JQ
1604 * @current_data: pointer to the address of the page contents
1605 * @current_addr: addr of the page
56e93d26
JQ
1606 * @block: block that contains the page we want to send
1607 * @offset: offset inside the block for the page
1608 * @last_stage: if we are at the completion stage
56e93d26 1609 */
204b88b8 1610static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1611 ram_addr_t current_addr, RAMBlock *block,
072c2511 1612 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1613{
1614 int encoded_len = 0, bytes_xbzrle;
1615 uint8_t *prev_cached_page;
1616
9360447d
JQ
1617 if (!cache_is_cached(XBZRLE.cache, current_addr,
1618 ram_counters.dirty_sync_count)) {
1619 xbzrle_counters.cache_miss++;
56e93d26
JQ
1620 if (!last_stage) {
1621 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1622 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1623 return -1;
1624 } else {
1625 /* update *current_data when the page has been
1626 inserted into cache */
1627 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1628 }
1629 }
1630 return -1;
1631 }
1632
1633 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1634
1635 /* save current buffer into memory */
1636 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1637
1638 /* XBZRLE encoding (if there is no overflow) */
1639 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1640 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1641 TARGET_PAGE_SIZE);
ca353803
WY
1642
1643 /*
1644 * Update the cache contents, so that it corresponds to the data
1645 * sent, in all cases except where we skip the page.
1646 */
1647 if (!last_stage && encoded_len != 0) {
1648 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1649 /*
1650 * In the case where we couldn't compress, ensure that the caller
1651 * sends the data from the cache, since the guest might have
1652 * changed the RAM since we copied it.
1653 */
1654 *current_data = prev_cached_page;
1655 }
1656
56e93d26 1657 if (encoded_len == 0) {
55c4446b 1658 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1659 return 0;
1660 } else if (encoded_len == -1) {
55c4446b 1661 trace_save_xbzrle_page_overflow();
9360447d 1662 xbzrle_counters.overflow++;
56e93d26
JQ
1663 return -1;
1664 }
1665
56e93d26 1666 /* Send XBZRLE based compressed page */
2bf3aa85 1667 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1668 offset | RAM_SAVE_FLAG_XBZRLE);
1669 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1670 qemu_put_be16(rs->f, encoded_len);
1671 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1672 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1673 xbzrle_counters.pages++;
1674 xbzrle_counters.bytes += bytes_xbzrle;
1675 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1676
1677 return 1;
1678}
1679
3d0684b2
JQ
1680/**
1681 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1682 *
a5f7b1a6 1683 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 1684 *
6f37bb8b 1685 * @rs: current RAM state
3d0684b2 1686 * @rb: RAMBlock where to search for dirty pages
a935e30f 1687 * @start: page where we start the search
f3f491fc 1688 */
56e93d26 1689static inline
a935e30f 1690unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1691 unsigned long start)
56e93d26 1692{
6b6712ef
JQ
1693 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1694 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1695 unsigned long next;
1696
fbd162e6 1697 if (ramblock_is_ignored(rb)) {
b895de50
CLG
1698 return size;
1699 }
1700
6eeb63f7
WW
1701 /*
1702 * When the free page optimization is enabled, we need to check the bitmap
1703 * to send the non-free pages rather than all the pages in the bulk stage.
1704 */
1705 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 1706 next = start + 1;
56e93d26 1707 } else {
6b6712ef 1708 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1709 }
1710
6b6712ef 1711 return next;
56e93d26
JQ
1712}
1713
06b10688 1714static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1715 RAMBlock *rb,
1716 unsigned long page)
a82d593b
DDAG
1717{
1718 bool ret;
a82d593b 1719
386a907b 1720 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
1721
    /*
     * Clear the dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk, because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk, we clear the remote dirty bitmap for all
     * of them.  Clearing it earlier won't be a problem, but too late will.
     */
1730 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1731 uint8_t shift = rb->clear_bmap_shift;
1732 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1733 hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1734
1735 /*
1736 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... it
1737 * can make things easier sometimes since then the start address
1738 * of the small chunk will always be aligned to 64 pages, so the
1739 * bitmap will always be aligned to unsigned long. We could
1740 * probably even remove this restriction, but I'm simply
1741 * keeping it for now.
1742 */
1743 assert(shift >= 6);
1744 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1745 memory_region_clear_dirty_bitmap(rb->mr, start, size);
1746 }
1747
6b6712ef 1748 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1749
1750 if (ret) {
0d8ec885 1751 rs->migration_dirty_pages--;
a82d593b 1752 }
386a907b
WW
1753 qemu_mutex_unlock(&rs->bitmap_mutex);
1754
a82d593b
DDAG
1755 return ret;
1756}
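/*
 * Worked example, assuming a 4 KiB target page (TARGET_PAGE_BITS == 12)
 * and clear_bmap_shift == 18: each clear_bmap chunk then covers
 * 1ULL << (12 + 18) = 1 GiB of guest memory, so the remote dirty bitmap
 * is cleared at most once per 1 GiB chunk rather than per page. The
 * assert(shift >= 6) keeps every chunk at least 64 pages long, i.e.
 * aligned to an unsigned long of bitmap.
 */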
1757
267691b6 1758/* Called with RCU critical section */
7a3e9571 1759static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1760{
0d8ec885 1761 rs->migration_dirty_pages +=
5d0980a4 1762 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
0d8ec885 1763 &rs->num_dirty_pages_period);
56e93d26
JQ
1764}
1765
3d0684b2
JQ
1766/**
1767 * ram_pagesize_summary: calculate all the pagesizes of a VM
1768 *
1769 * Returns a summary bitmap of the page sizes of all RAMBlocks
1770 *
1771 * For VMs with just normal pages this is equivalent to the host page
1772 * size. If it's got some huge pages then it's the OR of all the
1773 * different page sizes.
e8ca1db2
DDAG
1774 */
1775uint64_t ram_pagesize_summary(void)
1776{
1777 RAMBlock *block;
1778 uint64_t summary = 0;
1779
fbd162e6 1780 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1781 summary |= block->page_size;
1782 }
1783
1784 return summary;
1785}
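/*
 * Example: a VM whose RAM is backed by normal 4 KiB pages plus one
 * 2 MiB hugepage block would (on such a host) return
 * 0x1000 | 0x200000 = 0x201000, one bit per distinct page size.
 */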
1786
aecbfe9c
XG
1787uint64_t ram_get_total_transferred_pages(void)
1788{
1789 return ram_counters.normal + ram_counters.duplicate +
1790 compression_counters.pages + xbzrle_counters.pages;
1791}
1792
b734035b
XG
1793static void migration_update_rates(RAMState *rs, int64_t end_time)
1794{
be8b02ed 1795 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1796 double compressed_size;
b734035b
XG
1797
1798 /* calculate period counters */
1799 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1800 / (end_time - rs->time_last_bitmap_sync);
1801
be8b02ed 1802 if (!page_count) {
b734035b
XG
1803 return;
1804 }
1805
1806 if (migrate_use_xbzrle()) {
1807 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1808 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1809 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1810 }
76e03000
XG
1811
1812 if (migrate_use_compression()) {
1813 compression_counters.busy_rate = (double)(compression_counters.busy -
1814 rs->compress_thread_busy_prev) / page_count;
1815 rs->compress_thread_busy_prev = compression_counters.busy;
1816
1817 compressed_size = compression_counters.compressed_size -
1818 rs->compressed_size_prev;
1819 if (compressed_size) {
1820 double uncompressed_size = (compression_counters.pages -
1821 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1822
1823 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1824 compression_counters.compression_rate =
1825 uncompressed_size / compressed_size;
1826
1827 rs->compress_pages_prev = compression_counters.pages;
1828 rs->compressed_size_prev = compression_counters.compressed_size;
1829 }
1830 }
b734035b
XG
1831}
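/*
 * Illustrative arithmetic: the timestamps above are in milliseconds, so
 * dirty_pages_rate comes out in pages per second; e.g. 50000 pages
 * dirtied over a 2000 ms period gives 50000 * 1000 / 2000 = 25000
 * pages/s. Similarly, 300 new XBZRLE cache misses over 1000 target
 * pages sent in the period yields a cache_miss_rate of 0.3.
 */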
1832
8d820d6f 1833static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1834{
1835 RAMBlock *block;
56e93d26 1836 int64_t end_time;
c4bdf0cf 1837 uint64_t bytes_xfer_now;
56e93d26 1838
9360447d 1839 ram_counters.dirty_sync_count++;
56e93d26 1840
f664da80
JQ
1841 if (!rs->time_last_bitmap_sync) {
1842 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1843 }
1844
1845 trace_migration_bitmap_sync_start();
9c1f8f44 1846 memory_global_dirty_log_sync();
56e93d26 1847
108cfae0 1848 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 1849 rcu_read_lock();
fbd162e6 1850 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7a3e9571 1851 ramblock_sync_dirty_bitmap(rs, block);
56e93d26 1852 }
650af890 1853 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1854 rcu_read_unlock();
108cfae0 1855 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1856
a66cd90c 1857 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1858
56e93d26
JQ
1859 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1860
1861 /* more than 1 second = 1000 milliseconds */
f664da80 1862 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1863 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1864
9ac78b61
PL
1865 /* During block migration the auto-converge logic incorrectly detects
1866 * that ram migration makes no progress. Avoid this by disabling the
1867 * throttling logic during the bulk phase of block migration. */
1868 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1869 /* The following detection logic can be refined later. For now:
1870 Check to see if the dirtied bytes exceed 50% of the approx.
1871 amount of bytes that just got transferred since the last time we
070afca2
JH
1872 were in this routine. If that happens twice, start or increase
1873 throttling */
070afca2 1874
d693c6f1 1875 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1876 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1877 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1878 trace_migration_throttle();
8d820d6f 1879 rs->dirty_rate_high_cnt = 0;
070afca2 1880 mig_throttle_guest_down();
d693c6f1 1881 }
56e93d26 1882 }
070afca2 1883
b734035b
XG
1884 migration_update_rates(rs, end_time);
1885
be8b02ed 1886 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1887
1888 /* reset period counters */
f664da80 1889 rs->time_last_bitmap_sync = end_time;
a66cd90c 1890 rs->num_dirty_pages_period = 0;
d2a4d85a 1891 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1892 }
4addcd4f 1893 if (migrate_use_events()) {
3ab72385 1894 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1895 }
56e93d26
JQ
1896}
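/*
 * Example of the throttle trigger above, assuming 4 KiB target pages:
 * dirtying 100000 pages (~400 MB) in a period where only 600 MB were
 * transferred satisfies 400 MB > 600 MB / 2, so dirty_rate_high_cnt is
 * bumped; a second such period in a row calls mig_throttle_guest_down().
 */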
1897
bd227060
WW
1898static void migration_bitmap_sync_precopy(RAMState *rs)
1899{
1900 Error *local_err = NULL;
1901
1902 /*
1903 * The current notifier usage is just an optimization to migration, so we
1904 * don't stop the normal migration process in the error case.
1905 */
1906 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1907 error_report_err(local_err);
1908 }
1909
1910 migration_bitmap_sync(rs);
1911
1912 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1913 error_report_err(local_err);
1914 }
1915}
1916
6c97ec5f
XG
1917/**
1918 * save_zero_page_to_file: send the zero page to the file
1919 *
1920 * Returns the size of data written to the file, 0 means the page is not
1921 * a zero page
1922 *
1923 * @rs: current RAM state
1924 * @file: the file where the data is saved
1925 * @block: block that contains the page we want to send
1926 * @offset: offset inside the block for the page
1927 */
1928static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1929 RAMBlock *block, ram_addr_t offset)
1930{
1931 uint8_t *p = block->host + offset;
1932 int len = 0;
1933
1934 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1935 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1936 qemu_put_byte(file, 0);
1937 len += 1;
1938 }
1939 return len;
1940}
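/*
 * Note: a zero page therefore costs only the page header plus the
 * single zero byte written above; assuming an 8-byte
 * RAM_SAVE_FLAG_CONTINUE header, that is 9 bytes instead of a full
 * TARGET_PAGE_SIZE payload.
 */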
1941
56e93d26 1942/**
3d0684b2 1943 * save_zero_page: send the zero page to the stream
56e93d26 1944 *
3d0684b2 1945 * Returns the number of pages written.
56e93d26 1946 *
f7ccd61b 1947 * @rs: current RAM state
56e93d26
JQ
1948 * @block: block that contains the page we want to send
1949 * @offset: offset inside the block for the page
56e93d26 1950 */
7faccdc3 1951static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1952{
6c97ec5f 1953 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1954
6c97ec5f 1955 if (len) {
9360447d 1956 ram_counters.duplicate++;
6c97ec5f
XG
1957 ram_counters.transferred += len;
1958 return 1;
56e93d26 1959 }
6c97ec5f 1960 return -1;
56e93d26
JQ
1961}
1962
5727309d 1963static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1964{
5727309d 1965 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1966 return;
1967 }
1968
aaa2064c 1969 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1970}
1971
059ff0fb
XG
1972/*
1973 * @pages: the number of pages written by the control path,
1974 * < 0 - error
1975 * > 0 - number of pages written
1976 *
1977 * Returns true if the page has been saved, otherwise false is returned.
1978 */
1979static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1980 int *pages)
1981{
1982 uint64_t bytes_xmit = 0;
1983 int ret;
1984
1985 *pages = -1;
1986 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1987 &bytes_xmit);
1988 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1989 return false;
1990 }
1991
1992 if (bytes_xmit) {
1993 ram_counters.transferred += bytes_xmit;
1994 *pages = 1;
1995 }
1996
1997 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1998 return true;
1999 }
2000
2001 if (bytes_xmit > 0) {
2002 ram_counters.normal++;
2003 } else if (bytes_xmit == 0) {
2004 ram_counters.duplicate++;
2005 }
2006
2007 return true;
2008}
2009
65dacaa0
XG
2010/*
2011 * directly send the page to the stream
2012 *
2013 * Returns the number of pages written.
2014 *
2015 * @rs: current RAM state
2016 * @block: block that contains the page we want to send
2017 * @offset: offset inside the block for the page
2018 * @buf: the page to be sent
2019 * @async: send the page asynchronously
2020 */
2021static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2022 uint8_t *buf, bool async)
2023{
2024 ram_counters.transferred += save_page_header(rs, rs->f, block,
2025 offset | RAM_SAVE_FLAG_PAGE);
2026 if (async) {
2027 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2028 migrate_release_ram() &
2029 migration_in_postcopy());
2030 } else {
2031 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2032 }
2033 ram_counters.transferred += TARGET_PAGE_SIZE;
2034 ram_counters.normal++;
2035 return 1;
2036}
2037
56e93d26 2038/**
3d0684b2 2039 * ram_save_page: send the given page to the stream
56e93d26 2040 *
3d0684b2 2041 * Returns the number of pages written.
3fd3c4b3
DDAG
2042 * < 0 - error
2043 * >=0 - Number of pages written - this might legally be 0
2044 * if xbzrle noticed the page was the same.
56e93d26 2045 *
6f37bb8b 2046 * @rs: current RAM state
56e93d26
JQ
2047 * @block: block that contains the page we want to send
2048 * @offset: offset inside the block for the page
2049 * @last_stage: if we are at the completion stage
56e93d26 2050 */
a0a8aa14 2051static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
2052{
2053 int pages = -1;
56e93d26 2054 uint8_t *p;
56e93d26 2055 bool send_async = true;
a08f6890 2056 RAMBlock *block = pss->block;
a935e30f 2057 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 2058 ram_addr_t current_addr = block->offset + offset;
56e93d26 2059
2f68e399 2060 p = block->host + offset;
1db9d8e5 2061 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 2062
56e93d26 2063 XBZRLE_cache_lock();
d7400a34
XG
2064 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2065 migrate_use_xbzrle()) {
059ff0fb
XG
2066 pages = save_xbzrle_page(rs, &p, current_addr, block,
2067 offset, last_stage);
2068 if (!last_stage) {
2069 /* Can't send this cached data async, since the cache page
2070 * might get updated before it gets to the wire
56e93d26 2071 */
059ff0fb 2072 send_async = false;
56e93d26
JQ
2073 }
2074 }
2075
2076 /* XBZRLE overflow or normal page */
2077 if (pages == -1) {
65dacaa0 2078 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
2079 }
2080
2081 XBZRLE_cache_unlock();
2082
2083 return pages;
2084}
2085
b9ee2f7d
JQ
2086static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2087 ram_addr_t offset)
2088{
1b81c974 2089 if (multifd_queue_page(rs, block, offset) < 0) {
713f762a
IR
2090 return -1;
2091 }
b9ee2f7d
JQ
2092 ram_counters.normal++;
2093
2094 return 1;
2095}
2096
5e5fdcff 2097static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 2098 ram_addr_t offset, uint8_t *source_buf)
56e93d26 2099{
53518d94 2100 RAMState *rs = ram_state;
a7a9a88f 2101 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 2102 bool zero_page = false;
6ef3771c 2103 int ret;
56e93d26 2104
5e5fdcff
XG
2105 if (save_zero_page_to_file(rs, f, block, offset)) {
2106 zero_page = true;
2107 goto exit;
2108 }
2109
6ef3771c 2110 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
2111
2112 /*
2113 * copy it to an internal buffer to avoid it being modified by the VM
2114 * so that we can catch the error during compression and
2115 * decompression
2116 */
2117 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
2118 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2119 if (ret < 0) {
2120 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 2121 error_report("compressed data failed!");
5e5fdcff 2122 return false;
b3be2896 2123 }
56e93d26 2124
5e5fdcff 2125exit:
6ef3771c 2126 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
2127 return zero_page;
2128}
2129
2130static void
2131update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2132{
76e03000
XG
2133 ram_counters.transferred += bytes_xmit;
2134
5e5fdcff
XG
2135 if (param->zero_page) {
2136 ram_counters.duplicate++;
76e03000 2137 return;
5e5fdcff 2138 }
76e03000
XG
2139
2140 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2141 compression_counters.compressed_size += bytes_xmit - 8;
2142 compression_counters.pages++;
56e93d26
JQ
2143}
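/*
 * Worked example, assuming a 4 KiB target page: if a compression thread
 * produced bytes_xmit == 1208 for a non-zero page, the 8-byte
 * RAM_SAVE_FLAG_CONTINUE header is subtracted and compressed_size grows
 * by 1200, i.e. that page compressed at roughly 4096 / 1200 ~= 3.4:1.
 * migration_update_rates() averages this into compression_rate over the
 * whole period.
 */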
2144
32b05495
XG
2145static bool save_page_use_compression(RAMState *rs);
2146
ce25d337 2147static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
2148{
2149 int idx, len, thread_count;
2150
32b05495 2151 if (!save_page_use_compression(rs)) {
56e93d26
JQ
2152 return;
2153 }
2154 thread_count = migrate_compress_threads();
a7a9a88f 2155
0d9f9a5c 2156 qemu_mutex_lock(&comp_done_lock);
56e93d26 2157 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 2158 while (!comp_param[idx].done) {
0d9f9a5c 2159 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 2160 }
a7a9a88f 2161 }
0d9f9a5c 2162 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
2163
2164 for (idx = 0; idx < thread_count; idx++) {
2165 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 2166 if (!comp_param[idx].quit) {
ce25d337 2167 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
2168 /*
2169 * it's safe to fetch zero_page without holding comp_done_lock
2170 * as there is no further request submitted to the thread,
2171 * i.e., the thread should be waiting for a request at this point.
2172 */
2173 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 2174 }
a7a9a88f 2175 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
2176 }
2177}
2178
2179static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2180 ram_addr_t offset)
2181{
2182 param->block = block;
2183 param->offset = offset;
2184}
2185
ce25d337
JQ
2186static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2187 ram_addr_t offset)
56e93d26
JQ
2188{
2189 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 2190 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
2191
2192 thread_count = migrate_compress_threads();
0d9f9a5c 2193 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
2194retry:
2195 for (idx = 0; idx < thread_count; idx++) {
2196 if (comp_param[idx].done) {
2197 comp_param[idx].done = false;
2198 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2199 qemu_mutex_lock(&comp_param[idx].mutex);
2200 set_compress_params(&comp_param[idx], block, offset);
2201 qemu_cond_signal(&comp_param[idx].cond);
2202 qemu_mutex_unlock(&comp_param[idx].mutex);
2203 pages = 1;
5e5fdcff 2204 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2205 break;
56e93d26
JQ
2206 }
2207 }
1d58872a
XG
2208
2209 /*
2210 * wait for the free thread if the user specifies 'compress-wait-thread',
2211 * otherwise we will post the page out in the main thread as a normal page.
2212 */
2213 if (pages < 0 && wait) {
2214 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2215 goto retry;
2216 }
0d9f9a5c 2217 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2218
2219 return pages;
2220}
2221
3d0684b2
JQ
2222/**
2223 * find_dirty_block: find the next dirty page and update any state
2224 * associated with the search process.
b9e60928 2225 *
a5f7b1a6 2226 * Returns true if a page is found
b9e60928 2227 *
6f37bb8b 2228 * @rs: current RAM state
3d0684b2
JQ
2229 * @pss: data about the state of the current dirty page scan
2230 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2231 */
f20e2865 2232static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2233{
f20e2865 2234 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2235 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2236 pss->page >= rs->last_page) {
b9e60928
DDAG
2237 /*
2238 * We've been once around the RAM and haven't found anything.
2239 * Give up.
2240 */
2241 *again = false;
2242 return false;
2243 }
a935e30f 2244 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2245 /* Didn't find anything in this RAM Block */
a935e30f 2246 pss->page = 0;
b9e60928
DDAG
2247 pss->block = QLIST_NEXT_RCU(pss->block, next);
2248 if (!pss->block) {
48df9d80
XG
2249 /*
2250 * If memory migration starts over, we will meet a dirtied page
2251 * which may still exist in the compression threads' ring, so we
2252 * should flush the compressed data to make sure the new page
2253 * is not overwritten by the old one in the destination.
2254 *
2255 * Also, if xbzrle is on, stop using data compression at this
2256 * point. In theory, xbzrle can do better than compression.
2257 */
2258 flush_compressed_data(rs);
2259
b9e60928
DDAG
2260 /* Hit the end of the list */
2261 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2262 /* Flag that we've looped */
2263 pss->complete_round = true;
6f37bb8b 2264 rs->ram_bulk_stage = false;
b9e60928
DDAG
2265 }
2266 /* Didn't find anything this time, but try again on the new block */
2267 *again = true;
2268 return false;
2269 } else {
2270 /* Can go around again, but... */
2271 *again = true;
2272 /* We've found something so probably don't need to */
2273 return true;
2274 }
2275}
2276
3d0684b2
JQ
2277/**
2278 * unqueue_page: gets a page off the queue
2279 *
a82d593b 2280 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2281 *
3d0684b2
JQ
2282 * Returns the block of the page (or NULL if none available)
2283 *
ec481c6c 2284 * @rs: current RAM state
3d0684b2 2285 * @offset: used to return the offset within the RAMBlock
a82d593b 2286 */
f20e2865 2287static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2288{
2289 RAMBlock *block = NULL;
2290
ae526e32
XG
2291 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2292 return NULL;
2293 }
2294
ec481c6c
JQ
2295 qemu_mutex_lock(&rs->src_page_req_mutex);
2296 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2297 struct RAMSrcPageRequest *entry =
2298 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2299 block = entry->rb;
2300 *offset = entry->offset;
a82d593b
DDAG
2301
2302 if (entry->len > TARGET_PAGE_SIZE) {
2303 entry->len -= TARGET_PAGE_SIZE;
2304 entry->offset += TARGET_PAGE_SIZE;
2305 } else {
2306 memory_region_unref(block->mr);
ec481c6c 2307 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2308 g_free(entry);
e03a34f8 2309 migration_consume_urgent_request();
a82d593b
DDAG
2310 }
2311 }
ec481c6c 2312 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2313
2314 return block;
2315}
2316
3d0684b2 2317/**
ff1543af 2318 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2319 *
2320 * Skips pages that are already sent (!dirty)
a82d593b 2321 *
a5f7b1a6 2322 * Returns true if a queued page is found
a82d593b 2323 *
6f37bb8b 2324 * @rs: current RAM state
3d0684b2 2325 * @pss: data about the state of the current dirty page scan
a82d593b 2326 */
f20e2865 2327static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2328{
2329 RAMBlock *block;
2330 ram_addr_t offset;
2331 bool dirty;
2332
2333 do {
f20e2865 2334 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2335 /*
2336 * We're sending this page, and since it's postcopy nothing else
2337 * will dirty it, and we must make sure it doesn't get sent again
2338 * even if this queue request was received after the background
2339 * search already sent it.
2340 */
2341 if (block) {
f20e2865
JQ
2342 unsigned long page;
2343
6b6712ef
JQ
2344 page = offset >> TARGET_PAGE_BITS;
2345 dirty = test_bit(page, block->bmap);
a82d593b 2346 if (!dirty) {
06b10688 2347 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 2348 page, test_bit(page, block->unsentmap));
a82d593b 2349 } else {
f20e2865 2350 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2351 }
2352 }
2353
2354 } while (block && !dirty);
2355
2356 if (block) {
2357 /*
2358 * As soon as we start servicing pages out of order, we have
2359 * to kill the bulk stage, since the bulk stage assumes
2360 * (in migration_bitmap_find_and_reset_dirty) that every page is
2361 * dirty, and that's no longer true.
2362 */
6f37bb8b 2363 rs->ram_bulk_stage = false;
a82d593b
DDAG
2364
2365 /*
2366 * We want the background search to continue from the queued page
2367 * since the guest is likely to want other pages near to the page
2368 * it just requested.
2369 */
2370 pss->block = block;
a935e30f 2371 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2372
2373 /*
2374 * This unqueued page would break the "one round" check, even if it
2375 * is really rare.
2376 */
2377 pss->complete_round = false;
a82d593b
DDAG
2378 }
2379
2380 return !!block;
2381}
2382
6c595cde 2383/**
5e58f968
JQ
2384 * migration_page_queue_free: drop any remaining pages in the ram
2385 * request queue
6c595cde 2386 *
3d0684b2
JQ
2387 * It should be empty at the end anyway, but in error cases there may
2388 * be some left. In case any pages are left, we drop them.
2389 *
6c595cde 2390 */
83c13382 2391static void migration_page_queue_free(RAMState *rs)
6c595cde 2392{
ec481c6c 2393 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2394 /* This queue generally should be empty - but in the case of a failed
2395 * migration it might have some droppings in.
2396 */
2397 rcu_read_lock();
ec481c6c 2398 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2399 memory_region_unref(mspr->rb->mr);
ec481c6c 2400 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2401 g_free(mspr);
2402 }
2403 rcu_read_unlock();
2404}
2405
2406/**
3d0684b2
JQ
2407 * ram_save_queue_pages: queue the page for transmission
2408 *
2409 * A request from the postcopy destination, for example.
2410 *
2411 * Returns zero on success or negative on error
2412 *
3d0684b2
JQ
2413 * @rbname: Name of the RAMBlock of the request. NULL means the
2414 * same as the last one.
2415 * @start: starting address from the start of the RAMBlock
2416 * @len: length (in bytes) to send
6c595cde 2417 */
96506894 2418int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2419{
2420 RAMBlock *ramblock;
53518d94 2421 RAMState *rs = ram_state;
6c595cde 2422
9360447d 2423 ram_counters.postcopy_requests++;
6c595cde
DDAG
2424 rcu_read_lock();
2425 if (!rbname) {
2426 /* Reuse last RAMBlock */
68a098f3 2427 ramblock = rs->last_req_rb;
6c595cde
DDAG
2428
2429 if (!ramblock) {
2430 /*
2431 * Shouldn't happen, we can't reuse the last RAMBlock if
2432 * it's the 1st request.
2433 */
2434 error_report("ram_save_queue_pages no previous block");
2435 goto err;
2436 }
2437 } else {
2438 ramblock = qemu_ram_block_by_name(rbname);
2439
2440 if (!ramblock) {
2441 /* We shouldn't be asked for a non-existent RAMBlock */
2442 error_report("ram_save_queue_pages no block '%s'", rbname);
2443 goto err;
2444 }
68a098f3 2445 rs->last_req_rb = ramblock;
6c595cde
DDAG
2446 }
2447 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2448 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2449 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2450 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
2451 __func__, start, len, ramblock->used_length);
2452 goto err;
2453 }
2454
ec481c6c
JQ
2455 struct RAMSrcPageRequest *new_entry =
2456 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2457 new_entry->rb = ramblock;
2458 new_entry->offset = start;
2459 new_entry->len = len;
2460
2461 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2462 qemu_mutex_lock(&rs->src_page_req_mutex);
2463 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2464 migration_make_urgent_request();
ec481c6c 2465 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2466 rcu_read_unlock();
2467
2468 return 0;
2469
2470err:
2471 rcu_read_unlock();
2472 return -1;
2473}
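/*
 * Illustrative flow, assuming 4 KiB target pages and a block named
 * "pc.ram": a postcopy destination faulting on 16 KiB at offset
 * 0x200000 ends up queued here as one
 * RAMSrcPageRequest{rb, offset = 0x200000, len = 16384}; unqueue_page()
 * above then returns that block four times, advancing offset and
 * shrinking len by TARGET_PAGE_SIZE each time, and frees the entry on
 * the last page.
 */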
2474
d7400a34
XG
2475static bool save_page_use_compression(RAMState *rs)
2476{
2477 if (!migrate_use_compression()) {
2478 return false;
2479 }
2480
2481 /*
2482 * If xbzrle is on, stop using the data compression after first
2483 * round of migration even if compression is enabled. In theory,
2484 * xbzrle can do better than compression.
2485 */
2486 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2487 return true;
2488 }
2489
2490 return false;
2491}
2492
5e5fdcff
XG
2493/*
2494 * try to compress the page before posting it out, return true if the page
2495 * has been properly handled by compression, otherwise needs other
2496 * paths to handle it
2497 */
2498static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2499{
2500 if (!save_page_use_compression(rs)) {
2501 return false;
2502 }
2503
2504 /*
2505 * When starting the process of a new block, the first page of
2506 * the block should be sent out before other pages in the same
2507 * block, and all the pages in the last block should have been sent
2508 * out. Keeping this order is important, because the 'cont' flag
2509 * is used to avoid resending the block name.
2510 *
2511 * We post the first page as a normal page as compression will take
2512 * much CPU resource.
2513 */
2514 if (block != rs->last_sent_block) {
2515 flush_compressed_data(rs);
2516 return false;
2517 }
2518
2519 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2520 return true;
2521 }
2522
76e03000 2523 compression_counters.busy++;
5e5fdcff
XG
2524 return false;
2525}
2526
a82d593b 2527/**
3d0684b2 2528 * ram_save_target_page: save one target page
a82d593b 2529 *
3d0684b2 2530 * Returns the number of pages written
a82d593b 2531 *
6f37bb8b 2532 * @rs: current RAM state
3d0684b2 2533 * @pss: data about the page we want to send
a82d593b 2534 * @last_stage: if we are at the completion stage
a82d593b 2535 */
a0a8aa14 2536static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2537 bool last_stage)
a82d593b 2538{
a8ec91f9
XG
2539 RAMBlock *block = pss->block;
2540 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2541 int res;
2542
2543 if (control_save_page(rs, block, offset, &res)) {
2544 return res;
2545 }
2546
5e5fdcff
XG
2547 if (save_compress_page(rs, block, offset)) {
2548 return 1;
d7400a34
XG
2549 }
2550
2551 res = save_zero_page(rs, block, offset);
2552 if (res > 0) {
2553 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2554 * page would be stale
2555 */
2556 if (!save_page_use_compression(rs)) {
2557 XBZRLE_cache_lock();
2558 xbzrle_cache_zero_page(rs, block->offset + offset);
2559 XBZRLE_cache_unlock();
2560 }
2561 ram_release_pages(block->idstr, offset, res);
2562 return res;
2563 }
2564
da3f56cb 2565 /*
5e5fdcff
XG
2566 * do not use multifd for compression as the first page in the new
2567 * block should be posted out before sending the compressed page
da3f56cb 2568 */
5e5fdcff 2569 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2570 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2571 }
2572
1faa5665 2573 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2574}
2575
2576/**
3d0684b2 2577 * ram_save_host_page: save a whole host page
a82d593b 2578 *
3d0684b2
JQ
2579 * Starting at *offset send pages up to the end of the current host
2580 * page. It's valid for the initial offset to point into the middle of
2581 * a host page in which case the remainder of the hostpage is sent.
2582 * Only dirty target pages are sent. Note that the host page size may
2583 * be a huge page for this block.
1eb3fc0a
DDAG
2584 * The saving stops at the boundary of the used_length of the block
2585 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2586 *
3d0684b2
JQ
2587 * Returns the number of pages written or negative on error
2588 *
6f37bb8b 2589 * @rs: current RAM state
3d0684b2 2590 * @ms: current migration state
3d0684b2 2591 * @pss: data about the page we want to send
a82d593b 2592 * @last_stage: if we are at the completion stage
a82d593b 2593 */
a0a8aa14 2594static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2595 bool last_stage)
a82d593b
DDAG
2596{
2597 int tmppages, pages = 0;
a935e30f
JQ
2598 size_t pagesize_bits =
2599 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2600
fbd162e6 2601 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2602 error_report("block %s should not be migrated !", pss->block->idstr);
2603 return 0;
2604 }
2605
a82d593b 2606 do {
1faa5665
XG
2607 /* Check whether the page is dirty and, if it is, send it */
2608 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2609 pss->page++;
2610 continue;
2611 }
2612
f20e2865 2613 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2614 if (tmppages < 0) {
2615 return tmppages;
2616 }
2617
2618 pages += tmppages;
1faa5665
XG
2619 if (pss->block->unsentmap) {
2620 clear_bit(pss->page, pss->block->unsentmap);
2621 }
2622
a935e30f 2623 pss->page++;
1eb3fc0a
DDAG
2624 } while ((pss->page & (pagesize_bits - 1)) &&
2625 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2626
2627 /* The offset we leave with is the last one we looked at */
a935e30f 2628 pss->page--;
a82d593b
DDAG
2629 return pages;
2630}
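/*
 * Example, assuming a RAMBlock backed by 2 MiB hugepages and 4 KiB
 * target pages: pagesize_bits is 512, so the loop above keeps sending
 * dirty target pages until pss->page crosses a 512-page boundary (or
 * the end of used_length), keeping whole host pages together as
 * postcopy requires.
 */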
6c595cde 2631
56e93d26 2632/**
3d0684b2 2633 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2634 *
2635 * Called within an RCU critical section.
2636 *
e8f3735f
XG
2637 * Returns the number of pages written where zero means no dirty pages,
2638 * or negative on error
56e93d26 2639 *
6f37bb8b 2640 * @rs: current RAM state
56e93d26 2641 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2642 *
2643 * On systems where host-page-size > target-page-size it will send all the
2644 * pages in a host page that are dirty.
56e93d26
JQ
2645 */
2646
ce25d337 2647static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2648{
b8fb8cb7 2649 PageSearchStatus pss;
56e93d26 2650 int pages = 0;
b9e60928 2651 bool again, found;
56e93d26 2652
0827b9e9
AA
2653 /* No dirty page as there is zero RAM */
2654 if (!ram_bytes_total()) {
2655 return pages;
2656 }
2657
6f37bb8b 2658 pss.block = rs->last_seen_block;
a935e30f 2659 pss.page = rs->last_page;
b8fb8cb7
DDAG
2660 pss.complete_round = false;
2661
2662 if (!pss.block) {
2663 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2664 }
56e93d26 2665
b9e60928 2666 do {
a82d593b 2667 again = true;
f20e2865 2668 found = get_queued_page(rs, &pss);
b9e60928 2669
a82d593b
DDAG
2670 if (!found) {
2671 /* priority queue empty, so just search for something dirty */
f20e2865 2672 found = find_dirty_block(rs, &pss, &again);
a82d593b 2673 }
f3f491fc 2674
a82d593b 2675 if (found) {
f20e2865 2676 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2677 }
b9e60928 2678 } while (!pages && again);
56e93d26 2679
6f37bb8b 2680 rs->last_seen_block = pss.block;
a935e30f 2681 rs->last_page = pss.page;
56e93d26
JQ
2682
2683 return pages;
2684}
2685
2686void acct_update_position(QEMUFile *f, size_t size, bool zero)
2687{
2688 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2689
56e93d26 2690 if (zero) {
9360447d 2691 ram_counters.duplicate += pages;
56e93d26 2692 } else {
9360447d
JQ
2693 ram_counters.normal += pages;
2694 ram_counters.transferred += size;
56e93d26
JQ
2695 qemu_update_position(f, size);
2696 }
2697}
2698
fbd162e6 2699static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2700{
2701 RAMBlock *block;
2702 uint64_t total = 0;
2703
2704 rcu_read_lock();
fbd162e6
YK
2705 if (count_ignored) {
2706 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2707 total += block->used_length;
2708 }
2709 } else {
2710 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2711 total += block->used_length;
2712 }
99e15582 2713 }
56e93d26
JQ
2714 rcu_read_unlock();
2715 return total;
2716}
2717
fbd162e6
YK
2718uint64_t ram_bytes_total(void)
2719{
2720 return ram_bytes_total_common(false);
2721}
2722
f265e0e4 2723static void xbzrle_load_setup(void)
56e93d26 2724{
f265e0e4 2725 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2726}
2727
f265e0e4
JQ
2728static void xbzrle_load_cleanup(void)
2729{
2730 g_free(XBZRLE.decoded_buf);
2731 XBZRLE.decoded_buf = NULL;
2732}
2733
7d7c96be
PX
2734static void ram_state_cleanup(RAMState **rsp)
2735{
b9ccaf6d
DDAG
2736 if (*rsp) {
2737 migration_page_queue_free(*rsp);
2738 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2739 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2740 g_free(*rsp);
2741 *rsp = NULL;
2742 }
7d7c96be
PX
2743}
2744
84593a08
PX
2745static void xbzrle_cleanup(void)
2746{
2747 XBZRLE_cache_lock();
2748 if (XBZRLE.cache) {
2749 cache_fini(XBZRLE.cache);
2750 g_free(XBZRLE.encoded_buf);
2751 g_free(XBZRLE.current_buf);
2752 g_free(XBZRLE.zero_target_page);
2753 XBZRLE.cache = NULL;
2754 XBZRLE.encoded_buf = NULL;
2755 XBZRLE.current_buf = NULL;
2756 XBZRLE.zero_target_page = NULL;
2757 }
2758 XBZRLE_cache_unlock();
2759}
2760
f265e0e4 2761static void ram_save_cleanup(void *opaque)
56e93d26 2762{
53518d94 2763 RAMState **rsp = opaque;
6b6712ef 2764 RAMBlock *block;
eb859c53 2765
2ff64038 2766 /* the caller must hold the iothread lock or be in a bh, so there is
4633456c 2767 * no write race against the migration bitmap
2ff64038 2768 */
6b6712ef
JQ
2769 memory_global_dirty_log_stop();
2770
fbd162e6 2771 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2772 g_free(block->clear_bmap);
2773 block->clear_bmap = NULL;
6b6712ef
JQ
2774 g_free(block->bmap);
2775 block->bmap = NULL;
2776 g_free(block->unsentmap);
2777 block->unsentmap = NULL;
56e93d26
JQ
2778 }
2779
84593a08 2780 xbzrle_cleanup();
f0afa331 2781 compress_threads_save_cleanup();
7d7c96be 2782 ram_state_cleanup(rsp);
56e93d26
JQ
2783}
2784
6f37bb8b 2785static void ram_state_reset(RAMState *rs)
56e93d26 2786{
6f37bb8b
JQ
2787 rs->last_seen_block = NULL;
2788 rs->last_sent_block = NULL;
269ace29 2789 rs->last_page = 0;
6f37bb8b
JQ
2790 rs->last_version = ram_list.version;
2791 rs->ram_bulk_stage = true;
6eeb63f7 2792 rs->fpo_enabled = false;
56e93d26
JQ
2793}
2794
2795#define MAX_WAIT 50 /* ms, half buffered_file limit */
2796
4f2e4252
DDAG
2797/*
2798 * 'expected' is the value you expect the bitmap mostly to be full
2799 * of; it won't bother printing lines that are all this value.
2800 * If 'todump' is null the migration bitmap is dumped.
2801 */
6b6712ef
JQ
2802void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2803 unsigned long pages)
4f2e4252 2804{
4f2e4252
DDAG
2805 int64_t cur;
2806 int64_t linelen = 128;
2807 char linebuf[129];
2808
6b6712ef 2809 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2810 int64_t curb;
2811 bool found = false;
2812 /*
2813 * Last line; catch the case where the line length
2814 * is longer than remaining ram
2815 */
6b6712ef
JQ
2816 if (cur + linelen > pages) {
2817 linelen = pages - cur;
4f2e4252
DDAG
2818 }
2819 for (curb = 0; curb < linelen; curb++) {
2820 bool thisbit = test_bit(cur + curb, todump);
2821 linebuf[curb] = thisbit ? '1' : '.';
2822 found = found || (thisbit != expected);
2823 }
2824 if (found) {
2825 linebuf[curb] = '\0';
2826 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2827 }
2828 }
2829}
2830
e0b266f0
DDAG
2831/* **** functions for postcopy ***** */
2832
ced1c616
PB
2833void ram_postcopy_migrated_memory_release(MigrationState *ms)
2834{
2835 struct RAMBlock *block;
ced1c616 2836
fbd162e6 2837 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2838 unsigned long *bitmap = block->bmap;
2839 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2840 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2841
2842 while (run_start < range) {
2843 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2844 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2845 (run_end - run_start) << TARGET_PAGE_BITS);
2846 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2847 }
2848 }
2849}
2850
3d0684b2
JQ
2851/**
2852 * postcopy_send_discard_bm_ram: discard a RAMBlock
2853 *
2854 * Returns zero on success
2855 *
e0b266f0
DDAG
2856 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2857 * Note: At this point the 'unsentmap' is the processed bitmap combined
2858 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
2859 *
2860 * @ms: current migration state
89dab31b 2861 * @block: RAMBlock to discard
e0b266f0 2862 */
810cf2bb 2863static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2864{
6b6712ef 2865 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2866 unsigned long current;
6b6712ef 2867 unsigned long *unsentmap = block->unsentmap;
e0b266f0 2868
6b6712ef 2869 for (current = 0; current < end; ) {
e0b266f0 2870 unsigned long one = find_next_bit(unsentmap, end, current);
33a5cb62 2871 unsigned long zero, discard_length;
e0b266f0 2872
33a5cb62
WY
2873 if (one >= end) {
2874 break;
2875 }
e0b266f0 2876
33a5cb62
WY
2877 zero = find_next_zero_bit(unsentmap, end, one + 1);
2878
2879 if (zero >= end) {
2880 discard_length = end - one;
e0b266f0 2881 } else {
33a5cb62
WY
2882 discard_length = zero - one;
2883 }
810cf2bb 2884 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2885 current = one + discard_length;
e0b266f0
DDAG
2886 }
2887
2888 return 0;
2889}
2890
3d0684b2
JQ
2891/**
2892 * postcopy_each_ram_send_discard: discard all RAMBlocks
2893 *
2894 * Returns 0 for success or negative for error
2895 *
e0b266f0
DDAG
2896 * Utility for the outgoing postcopy code.
2897 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2898 * passing it bitmap indexes and name.
e0b266f0
DDAG
2899 * (qemu_ram_foreach_block ends up passing unscaled lengths
2900 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2901 *
2902 * @ms: current migration state
e0b266f0
DDAG
2903 */
2904static int postcopy_each_ram_send_discard(MigrationState *ms)
2905{
2906 struct RAMBlock *block;
2907 int ret;
2908
fbd162e6 2909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2910 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2911
2912 /*
2913 * Postcopy sends chunks of bitmap over the wire, but it
2914 * just needs indexes at this point, avoids it having
2915 * target page specific code.
2916 */
810cf2bb
WY
2917 ret = postcopy_send_discard_bm_ram(ms, block);
2918 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2919 if (ret) {
2920 return ret;
2921 }
2922 }
2923
2924 return 0;
2925}
2926
3d0684b2
JQ
2927/**
2928 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2929 *
2930 * Helper for postcopy_chunk_hostpages; it's called twice to
2931 * canonicalize the two bitmaps, that are similar, but one is
2932 * inverted.
99e314eb 2933 *
3d0684b2
JQ
2934 * Postcopy requires that all target pages in a hostpage are dirty or
2935 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2936 *
3d0684b2
JQ
2937 * @ms: current migration state
2938 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2939 * otherwise we need to canonicalize partially dirty host pages
2940 * @block: block that contains the page we want to canonicalize
99e314eb
DDAG
2941 */
2942static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
810cf2bb 2943 RAMBlock *block)
99e314eb 2944{
53518d94 2945 RAMState *rs = ram_state;
6b6712ef
JQ
2946 unsigned long *bitmap = block->bmap;
2947 unsigned long *unsentmap = block->unsentmap;
29c59172 2948 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2949 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2950 unsigned long run_start;
2951
29c59172
DDAG
2952 if (block->page_size == TARGET_PAGE_SIZE) {
2953 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2954 return;
2955 }
2956
99e314eb
DDAG
2957 if (unsent_pass) {
2958 /* Find a sent page */
6b6712ef 2959 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
2960 } else {
2961 /* Find a dirty page */
6b6712ef 2962 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
2963 }
2964
6b6712ef 2965 while (run_start < pages) {
99e314eb
DDAG
2966
2967 /*
2968 * If the start of this run of pages is in the middle of a host
2969 * page, then we need to fixup this host page.
2970 */
9dec3cc3 2971 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2972 /* Find the end of this run */
99e314eb 2973 if (unsent_pass) {
dad45ab2 2974 run_start = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 2975 } else {
dad45ab2 2976 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2977 }
2978 /*
2979 * If the end isn't at the start of a host page, then the
2980 * run doesn't finish at the end of a host page
2981 * and we need to discard.
2982 */
99e314eb
DDAG
2983 }
2984
9dec3cc3 2985 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2986 unsigned long page;
dad45ab2
WY
2987 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2988 host_ratio);
2989 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb
DDAG
2990
2991 /* Tell the destination to discard this page */
2992 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2993 /* For the unsent_pass we:
2994 * discard partially sent pages
2995 * For the !unsent_pass (dirty) we:
2996 * discard partially dirty pages that were sent
2997 * (any partially sent pages were already discarded
2998 * by the previous unsent_pass)
2999 */
810cf2bb 3000 postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
99e314eb
DDAG
3001 }
3002
3003 /* Clean up the bitmap */
3004 for (page = fixup_start_addr;
3005 page < fixup_start_addr + host_ratio; page++) {
3006 /* All pages in this host page are now not sent */
3007 set_bit(page, unsentmap);
3008
3009 /*
3010 * Remark them as dirty, updating the count for any pages
3011 * that weren't previously dirty.
3012 */
0d8ec885 3013 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
3014 }
3015 }
3016
3017 if (unsent_pass) {
3018 /* Find the next sent page for the next iteration */
6b6712ef 3019 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
3020 } else {
3021 /* Find the next dirty page for the next iteration */
6b6712ef 3022 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
3023 }
3024 }
3025}
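/*
 * Worked example, assuming 2 MiB hugepages and 4 KiB target pages
 * (host_ratio == 512): a dirty run starting at target page 1000 is not
 * 512-aligned, so fixup_start_addr becomes 512 and run_start is rounded
 * up to 1024; target pages 512..1023 are then re-marked unsent and
 * dirty (and, when needed, reported to the destination as discarded),
 * so the whole host page is resent as one unit.
 */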
3026
3d0684b2 3027/**
89dab31b 3028 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 3029 *
99e314eb
DDAG
3030 * Utility for the outgoing postcopy code.
3031 *
3032 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
3033 * dirty host-page size chunks as all dirty. In this case the host-page
3034 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 3035 *
3d0684b2
JQ
3036 * Returns zero on success
3037 *
3038 * @ms: current migration state
6b6712ef 3039 * @block: block we want to work with
99e314eb 3040 */
6b6712ef 3041static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 3042{
810cf2bb 3043 postcopy_discard_send_init(ms, block->idstr);
99e314eb 3044
6b6712ef 3045 /* First pass: Discard all partially sent host pages */
810cf2bb 3046 postcopy_chunk_hostpages_pass(ms, true, block);
6b6712ef
JQ
3047 /*
3048 * Second pass: Ensure that all partially dirty host pages are made
3049 * fully dirty.
3050 */
810cf2bb 3051 postcopy_chunk_hostpages_pass(ms, false, block);
99e314eb 3052
810cf2bb 3053 postcopy_discard_send_finish(ms);
99e314eb
DDAG
3054 return 0;
3055}
3056
3d0684b2
JQ
3057/**
3058 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3059 *
3060 * Returns zero on success
3061 *
e0b266f0
DDAG
3062 * Transmit the set of pages to be discarded after precopy to the target;
3063 * these are pages that:
3064 * a) Have been previously transmitted but are now dirty again
3065 * b) Pages that have never been transmitted, this ensures that
3066 * any pages on the destination that have been mapped by background
3067 * tasks get discarded (transparent huge pages is the specific concern)
3068 * Hopefully this is pretty sparse
3d0684b2
JQ
3069 *
3070 * @ms: current migration state
e0b266f0
DDAG
3071 */
3072int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3073{
53518d94 3074 RAMState *rs = ram_state;
6b6712ef 3075 RAMBlock *block;
e0b266f0 3076 int ret;
e0b266f0
DDAG
3077
3078 rcu_read_lock();
3079
3080 /* This should be our last sync, the src is now paused */
eb859c53 3081 migration_bitmap_sync(rs);
e0b266f0 3082
6b6712ef
JQ
3083 /* Easiest way to make sure we don't resume in the middle of a host-page */
3084 rs->last_seen_block = NULL;
3085 rs->last_sent_block = NULL;
3086 rs->last_page = 0;
e0b266f0 3087
fbd162e6 3088 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
3089 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3090 unsigned long *bitmap = block->bmap;
3091 unsigned long *unsentmap = block->unsentmap;
3092
3093 if (!unsentmap) {
3094 /* We don't have a safe way to resize the sentmap, so
3095 * if the bitmap was resized it will be NULL at this
3096 * point.
3097 */
3098 error_report("migration ram resized during precopy phase");
3099 rcu_read_unlock();
3100 return -EINVAL;
3101 }
3102 /* Deal with TPS != HPS and huge pages */
3103 ret = postcopy_chunk_hostpages(ms, block);
3104 if (ret) {
3105 rcu_read_unlock();
3106 return ret;
3107 }
e0b266f0 3108
6b6712ef
JQ
3109 /*
3110 * Update the unsentmap to be unsentmap = unsentmap | dirty
3111 */
3112 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 3113#ifdef DEBUG_POSTCOPY
6b6712ef 3114 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 3115#endif
6b6712ef
JQ
3116 }
3117 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
3118
3119 ret = postcopy_each_ram_send_discard(ms);
3120 rcu_read_unlock();
3121
3122 return ret;
3123}
3124
3d0684b2
JQ
3125/**
3126 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 3127 *
3d0684b2 3128 * Returns zero on success
e0b266f0 3129 *
36449157
JQ
3130 * @rbname: name of the RAMBlock of the request. NULL means the
3131 * same as the last one.
3d0684b2
JQ
3132 * @start: RAMBlock starting page
3133 * @length: RAMBlock size
e0b266f0 3134 */
aaa2064c 3135int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
3136{
3137 int ret = -1;
3138
36449157 3139 trace_ram_discard_range(rbname, start, length);
d3a5038c 3140
e0b266f0 3141 rcu_read_lock();
36449157 3142 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
3143
3144 if (!rb) {
36449157 3145 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
3146 goto err;
3147 }
3148
814bb08f
PX
3149 /*
3150 * On source VM, we don't need to update the received bitmap since
3151 * we don't even have one.
3152 */
3153 if (rb->receivedmap) {
3154 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3155 length >> qemu_target_page_bits());
3156 }
3157
d3a5038c 3158 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3159
3160err:
3161 rcu_read_unlock();
3162
3163 return ret;
3164}
3165
84593a08
PX
3166/*
3167 * For every allocation, we will try not to crash the VM if the
3168 * allocation fails.
3169 */
3170static int xbzrle_init(void)
3171{
3172 Error *local_err = NULL;
3173
3174 if (!migrate_use_xbzrle()) {
3175 return 0;
3176 }
3177
3178 XBZRLE_cache_lock();
3179
3180 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3181 if (!XBZRLE.zero_target_page) {
3182 error_report("%s: Error allocating zero page", __func__);
3183 goto err_out;
3184 }
3185
3186 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3187 TARGET_PAGE_SIZE, &local_err);
3188 if (!XBZRLE.cache) {
3189 error_report_err(local_err);
3190 goto free_zero_page;
3191 }
3192
3193 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3194 if (!XBZRLE.encoded_buf) {
3195 error_report("%s: Error allocating encoded_buf", __func__);
3196 goto free_cache;
3197 }
3198
3199 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3200 if (!XBZRLE.current_buf) {
3201 error_report("%s: Error allocating current_buf", __func__);
3202 goto free_encoded_buf;
3203 }
3204
3205 /* We are all good */
3206 XBZRLE_cache_unlock();
3207 return 0;
3208
3209free_encoded_buf:
3210 g_free(XBZRLE.encoded_buf);
3211 XBZRLE.encoded_buf = NULL;
3212free_cache:
3213 cache_fini(XBZRLE.cache);
3214 XBZRLE.cache = NULL;
3215free_zero_page:
3216 g_free(XBZRLE.zero_target_page);
3217 XBZRLE.zero_target_page = NULL;
3218err_out:
3219 XBZRLE_cache_unlock();
3220 return -ENOMEM;
3221}
3222
53518d94 3223static int ram_state_init(RAMState **rsp)
56e93d26 3224{
7d00ee6a
PX
3225 *rsp = g_try_new0(RAMState, 1);
3226
3227 if (!*rsp) {
3228 error_report("%s: Init ramstate fail", __func__);
3229 return -1;
3230 }
53518d94
JQ
3231
3232 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3233 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3234 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3235
7d00ee6a 3236 /*
40c4d4a8
IR
3237 * Count the total number of pages used by ram blocks not including any
3238 * gaps due to alignment or unplugs.
03158519 3239 * This must match with the initial values of dirty bitmap.
7d00ee6a 3240 */
40c4d4a8 3241 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3242 ram_state_reset(*rsp);
3243
3244 return 0;
3245}
3246
d6eff5d7 3247static void ram_list_init_bitmaps(void)
7d00ee6a 3248{
002cad6b 3249 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3250 RAMBlock *block;
3251 unsigned long pages;
002cad6b 3252 uint8_t shift;
56e93d26 3253
0827b9e9
AA
3254 /* Skip setting bitmap if there is no RAM */
3255 if (ram_bytes_total()) {
002cad6b
PX
3256 shift = ms->clear_bitmap_shift;
3257 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3258 error_report("clear_bitmap_shift (%u) too big, using "
3259 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3260 shift = CLEAR_BITMAP_SHIFT_MAX;
3261 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3262 error_report("clear_bitmap_shift (%u) too small, using "
3263 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3264 shift = CLEAR_BITMAP_SHIFT_MIN;
3265 }
3266
fbd162e6 3267 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3268 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3269 /*
3270 * The initial dirty bitmap for migration must be set with all
3271 * ones to make sure we'll migrate every guest RAM page to
3272 * destination.
40c4d4a8
IR
3273 * Here we set RAMBlock.bmap all to 1 because when we restart a
3274 * new migration after a failed one, ram_list.
3275 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3276 * guest memory.
03158519 3277 */
6b6712ef 3278 block->bmap = bitmap_new(pages);
40c4d4a8 3279 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3280 block->clear_bmap_shift = shift;
3281 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
6b6712ef
JQ
3282 if (migrate_postcopy_ram()) {
3283 block->unsentmap = bitmap_new(pages);
3284 bitmap_set(block->unsentmap, 0, pages);
3285 }
0827b9e9 3286 }
f3f491fc 3287 }
d6eff5d7
PX
3288}
3289
3290static void ram_init_bitmaps(RAMState *rs)
3291{
3292 /* For memory_global_dirty_log_start below. */
3293 qemu_mutex_lock_iothread();
3294 qemu_mutex_lock_ramlist();
3295 rcu_read_lock();
f3f491fc 3296
d6eff5d7 3297 ram_list_init_bitmaps();
56e93d26 3298 memory_global_dirty_log_start();
bd227060 3299 migration_bitmap_sync_precopy(rs);
d6eff5d7
PX
3300
3301 rcu_read_unlock();
56e93d26 3302 qemu_mutex_unlock_ramlist();
49877834 3303 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3304}
3305
3306static int ram_init_all(RAMState **rsp)
3307{
3308 if (ram_state_init(rsp)) {
3309 return -1;
3310 }
3311
3312 if (xbzrle_init()) {
3313 ram_state_cleanup(rsp);
3314 return -1;
3315 }
3316
3317 ram_init_bitmaps(*rsp);
a91246c9
HZ
3318
3319 return 0;
3320}
3321
08614f34
PX
3322static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3323{
3324 RAMBlock *block;
3325 uint64_t pages = 0;
3326
3327 /*
3328 * Postcopy is not using xbzrle/compression, so no need for that.
3329 * Also, since the source is already halted, we don't need to care
3330 * about dirty page logging either.
3331 */
3332
fbd162e6 3333 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3334 pages += bitmap_count_one(block->bmap,
3335 block->used_length >> TARGET_PAGE_BITS);
3336 }
3337
3338 /* This may not be aligned with current bitmaps. Recalculate. */
3339 rs->migration_dirty_pages = pages;
3340
3341 rs->last_seen_block = NULL;
3342 rs->last_sent_block = NULL;
3343 rs->last_page = 0;
3344 rs->last_version = ram_list.version;
3345 /*
3346 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3347 * matter what we have sent.
3348 */
3349 rs->ram_bulk_stage = false;
3350
3351 /* Update RAMState cache of output QEMUFile */
3352 rs->f = out;
3353
3354 trace_ram_state_resume_prepare(pages);
3355}
3356
6bcb05fc
WW
3357/*
3358 * This function clears bits of the free pages reported by the caller from the
3359 * migration dirty bitmap. @addr is the host address corresponding to the
3360 * start of the continuous guest free pages, and @len is the total bytes of
3361 * those pages.
3362 */
3363void qemu_guest_free_page_hint(void *addr, size_t len)
3364{
3365 RAMBlock *block;
3366 ram_addr_t offset;
3367 size_t used_len, start, npages;
3368 MigrationState *s = migrate_get_current();
3369
3370 /* This function is currently expected to be used during live migration */
3371 if (!migration_is_setup_or_active(s->state)) {
3372 return;
3373 }
3374
3375 for (; len > 0; len -= used_len, addr += used_len) {
3376 block = qemu_ram_block_from_host(addr, false, &offset);
3377 if (unlikely(!block || offset >= block->used_length)) {
3378 /*
3379 * The implementation might not support RAMBlock resize during
3380 * live migration, but it could happen in theory with future
3381 * updates. So we add a check here to capture that case.
3382 */
3383 error_report_once("%s unexpected error", __func__);
3384 return;
3385 }
3386
3387 if (len <= block->used_length - offset) {
3388 used_len = len;
3389 } else {
3390 used_len = block->used_length - offset;
3391 }
3392
3393 start = offset >> TARGET_PAGE_BITS;
3394 npages = used_len >> TARGET_PAGE_BITS;
3395
3396 qemu_mutex_lock(&ram_state->bitmap_mutex);
3397 ram_state->migration_dirty_pages -=
3398 bitmap_count_one_with_offset(block->bmap, start, npages);
3399 bitmap_clear(block->bmap, start, npages);
3400 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3401 }
3402}
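/*
 * Example, assuming 4 KiB target pages: a 1 MiB free-page hint clears
 * npages = 256 bits of block->bmap and subtracts however many of them
 * were still set from migration_dirty_pages, so those free pages are
 * simply skipped by the dirty-page scan.
 */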
3403
3d0684b2
JQ
3404/*
3405 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3406 * a long-running RCU critical section. When RCU reclaims in the code
3407 * start to become numerous it will be necessary to reduce the
3408 * granularity of these critical sections.
3409 */
3410
3d0684b2
JQ
3411/**
3412 * ram_save_setup: Setup RAM for migration
3413 *
3414 * Returns zero to indicate success and negative for error
3415 *
3416 * @f: QEMUFile where to send the data
3417 * @opaque: RAMState pointer
3418 */
a91246c9
HZ
3419static int ram_save_setup(QEMUFile *f, void *opaque)
3420{
53518d94 3421 RAMState **rsp = opaque;
a91246c9
HZ
3422 RAMBlock *block;
3423
dcaf446e
XG
3424 if (compress_threads_save_setup()) {
3425 return -1;
3426 }
3427
a91246c9
HZ
3428 /* migration has already set up the bitmap; reuse it. */
3429 if (!migration_in_colo_state()) {
7d00ee6a 3430 if (ram_init_all(rsp) != 0) {
dcaf446e 3431 compress_threads_save_cleanup();
a91246c9 3432 return -1;
53518d94 3433 }
a91246c9 3434 }
53518d94 3435 (*rsp)->f = f;
a91246c9
HZ
3436
3437 rcu_read_lock();
56e93d26 3438
fbd162e6 3439 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3440
b895de50 3441 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26
JQ
3442 qemu_put_byte(f, strlen(block->idstr));
3443 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3444 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
3445 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3446 qemu_put_be64(f, block->page_size);
3447 }
fbd162e6
YK
3448 if (migrate_ignore_shared()) {
3449 qemu_put_be64(f, block->mr->addr);
fbd162e6 3450 }
56e93d26
JQ
3451 }
3452
3453 rcu_read_unlock();
3454
3455 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3456 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3457
1b81c974 3458 multifd_send_sync_main(*rsp);
56e93d26 3459 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3460 qemu_fflush(f);
56e93d26
JQ
3461
3462 return 0;
3463}
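/*
 * Editorial sketch (derived from the qemu_put_* calls above, not a formal
 * spec): the setup-stage record that ram_save_setup() emits and that
 * ram_load_precopy() parses under RAM_SAVE_FLAG_MEM_SIZE looks like:
 *
 *   be64   ram_bytes_total | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable RAMBlock:
 *     u8     strlen(idstr)
 *     bytes  idstr (not NUL-terminated on the wire)
 *     be64   used_length
 *     be64   page_size   (only with postcopy and page_size != host page size)
 *     be64   mr->addr    (only with the ignore-shared capability)
 *   be64   RAM_SAVE_FLAG_EOS
 */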
3464
3d0684b2
JQ
3465/**
3466 * ram_save_iterate: iterative stage for migration
3467 *
3468 * Returns zero to indicate success and negative for error
3469 *
3470 * @f: QEMUFile where to send the data
3471 * @opaque: RAMState pointer
3472 */
56e93d26
JQ
3473static int ram_save_iterate(QEMUFile *f, void *opaque)
3474{
53518d94
JQ
3475 RAMState **temp = opaque;
3476 RAMState *rs = *temp;
56e93d26
JQ
3477 int ret;
3478 int i;
3479 int64_t t0;
5c90308f 3480 int done = 0;
56e93d26 3481
b2557345
PL
3482 if (blk_mig_bulk_active()) {
3483 /* Avoid transferring RAM during the bulk phase of block migration, as
3484 * the bulk phase will usually take a long time and transferring
3485 * RAM updates during that time is pointless. */
3486 goto out;
3487 }
3488
56e93d26 3489 rcu_read_lock();
6f37bb8b
JQ
3490 if (ram_list.version != rs->last_version) {
3491 ram_state_reset(rs);
56e93d26
JQ
3492 }
3493
3494 /* Read version before ram_list.blocks */
3495 smp_rmb();
3496
3497 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3498
3499 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3500 i = 0;
e03a34f8
DDAG
3501 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3502 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
56e93d26
JQ
3503 int pages;
3504
e03a34f8
DDAG
3505 if (qemu_file_get_error(f)) {
3506 break;
3507 }
3508
ce25d337 3509 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
3510 /* no more pages to send */
3511 if (pages == 0) {
5c90308f 3512 done = 1;
56e93d26
JQ
3513 break;
3514 }
e8f3735f
XG
3515
3516 if (pages < 0) {
3517 qemu_file_set_error(f, pages);
3518 break;
3519 }
3520
be8b02ed 3521 rs->target_page_count += pages;
070afca2 3522
56e93d26
JQ
3523 /* we want to check in the first loop, just in case it was the first time
3524 and we had to sync the dirty bitmap.
a5f7b1a6 3525 qemu_clock_get_ns() is a bit expensive, so we only check every
56e93d26
JQ
3526 few iterations
3527 */
3528 if ((i & 63) == 0) {
3529 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3530 if (t1 > MAX_WAIT) {
55c4446b 3531 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
3532 break;
3533 }
3534 }
3535 i++;
3536 }
56e93d26
JQ
3537 rcu_read_unlock();
3538
3539 /*
3540 * Must occur before EOS (or any QEMUFile operation)
3541 * because of RDMA protocol.
3542 */
3543 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3544
b2557345 3545out:
1b81c974 3546 multifd_send_sync_main(rs);
56e93d26 3547 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3548 qemu_fflush(f);
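/* account for the 8-byte RAM_SAVE_FLAG_EOS marker written just above */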
9360447d 3549 ram_counters.transferred += 8;
56e93d26
JQ
3550
3551 ret = qemu_file_get_error(f);
3552 if (ret < 0) {
3553 return ret;
3554 }
3555
5c90308f 3556 return done;
56e93d26
JQ
3557}
3558
3d0684b2
JQ
3559/**
3560 * ram_save_complete: function called to send the remaining amount of RAM
3561 *
e8f3735f 3562 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3563 *
3564 * Called with iothread lock
3565 *
3566 * @f: QEMUFile where to send the data
3567 * @opaque: RAMState pointer
3568 */
56e93d26
JQ
3569static int ram_save_complete(QEMUFile *f, void *opaque)
3570{
53518d94
JQ
3571 RAMState **temp = opaque;
3572 RAMState *rs = *temp;
e8f3735f 3573 int ret = 0;
6f37bb8b 3574
56e93d26
JQ
3575 rcu_read_lock();
3576
5727309d 3577 if (!migration_in_postcopy()) {
bd227060 3578 migration_bitmap_sync_precopy(rs);
663e6c1d 3579 }
56e93d26
JQ
3580
3581 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3582
3583 /* try transferring iterative blocks of memory */
3584
3585 /* flush all remaining blocks regardless of rate limiting */
3586 while (true) {
3587 int pages;
3588
ce25d337 3589 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
3590 /* no more blocks to send */
3591 if (pages == 0) {
3592 break;
3593 }
e8f3735f
XG
3594 if (pages < 0) {
3595 ret = pages;
3596 break;
3597 }
56e93d26
JQ
3598 }
3599
ce25d337 3600 flush_compressed_data(rs);
56e93d26 3601 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
3602
3603 rcu_read_unlock();
d09a6fde 3604
1b81c974 3605 multifd_send_sync_main(rs);
56e93d26 3606 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3607 qemu_fflush(f);
56e93d26 3608
e8f3735f 3609 return ret;
56e93d26
JQ
3610}
3611
c31b098f 3612static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3613 uint64_t *res_precopy_only,
3614 uint64_t *res_compatible,
3615 uint64_t *res_postcopy_only)
56e93d26 3616{
53518d94
JQ
3617 RAMState **temp = opaque;
3618 RAMState *rs = *temp;
56e93d26
JQ
3619 uint64_t remaining_size;
3620
9edabd4d 3621 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3622
5727309d 3623 if (!migration_in_postcopy() &&
663e6c1d 3624 remaining_size < max_size) {
56e93d26
JQ
3625 qemu_mutex_lock_iothread();
3626 rcu_read_lock();
bd227060 3627 migration_bitmap_sync_precopy(rs);
56e93d26
JQ
3628 rcu_read_unlock();
3629 qemu_mutex_unlock_iothread();
9edabd4d 3630 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3631 }
c31b098f 3632
86e1167e
VSO
3633 if (migrate_postcopy_ram()) {
3634 /* We can do postcopy, and all the data is postcopiable */
47995026 3635 *res_compatible += remaining_size;
86e1167e 3636 } else {
47995026 3637 *res_precopy_only += remaining_size;
86e1167e 3638 }
56e93d26
JQ
3639}
3640
3641static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3642{
3643 unsigned int xh_len;
3644 int xh_flags;
063e760a 3645 uint8_t *loaded_data;
56e93d26 3646
56e93d26
JQ
3647 /* extract RLE header */
3648 xh_flags = qemu_get_byte(f);
3649 xh_len = qemu_get_be16(f);
3650
3651 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3652 error_report("Failed to load XBZRLE page - wrong compression!");
3653 return -1;
3654 }
3655
3656 if (xh_len > TARGET_PAGE_SIZE) {
3657 error_report("Failed to load XBZRLE page - len overflow!");
3658 return -1;
3659 }
f265e0e4 3660 loaded_data = XBZRLE.decoded_buf;
56e93d26 3661 /* load data and decode */
f265e0e4 3662 /* it can change loaded_data to point to an internal buffer */
063e760a 3663 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3664
3665 /* decode RLE */
063e760a 3666 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3667 TARGET_PAGE_SIZE) == -1) {
3668 error_report("Failed to load XBZRLE page - decode error!");
3669 return -1;
3670 }
3671
3672 return 0;
3673}
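/*
 * Editorial sketch (derived from the reads above): an XBZRLE page on the
 * wire is
 *
 *   u8    xh_flags   (must be ENCODING_FLAG_XBZRLE)
 *   be16  xh_len     (encoded length, at most TARGET_PAGE_SIZE)
 *   bytes xh_len bytes of encoded data
 *
 * and the encoded data is applied as a delta on top of the current
 * contents of @host to rebuild the full TARGET_PAGE_SIZE page.
 */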
3674
3d0684b2
JQ
3675/**
3676 * ram_block_from_stream: read a RAMBlock id from the migration stream
3677 *
3678 * Must be called from within a rcu critical section.
3679 *
56e93d26 3680 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3681 *
3d0684b2
JQ
3682 * @f: QEMUFile where to read the data from
3683 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3684 */
3d0684b2 3685static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3686{
3687 static RAMBlock *block = NULL;
3688 char id[256];
3689 uint8_t len;
3690
3691 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3692 if (!block) {
56e93d26
JQ
3693 error_report("Ack, bad migration stream!");
3694 return NULL;
3695 }
4c4bad48 3696 return block;
56e93d26
JQ
3697 }
3698
3699 len = qemu_get_byte(f);
3700 qemu_get_buffer(f, (uint8_t *)id, len);
3701 id[len] = 0;
3702
e3dd7493 3703 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3704 if (!block) {
3705 error_report("Can't find block %s", id);
3706 return NULL;
56e93d26
JQ
3707 }
3708
fbd162e6 3709 if (ramblock_is_ignored(block)) {
b895de50
CLG
3710 error_report("block %s should not be migrated !", id);
3711 return NULL;
3712 }
3713
4c4bad48
HZ
3714 return block;
3715}
3716
3717static inline void *host_from_ram_block_offset(RAMBlock *block,
3718 ram_addr_t offset)
3719{
3720 if (!offset_in_ramblock(block, offset)) {
3721 return NULL;
3722 }
3723
3724 return block->host + offset;
56e93d26
JQ
3725}
3726
13af18f2
ZC
3727static inline void *colo_cache_from_block_offset(RAMBlock *block,
3728 ram_addr_t offset)
3729{
3730 if (!offset_in_ramblock(block, offset)) {
3731 return NULL;
3732 }
3733 if (!block->colo_cache) {
3734 error_report("%s: colo_cache is NULL in block: %s",
3735 __func__, block->idstr);
3736 return NULL;
3737 }
7d9acafa
ZC
3738
3739 /*
3740 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3741 * It helps us decide which pages in the RAM cache should be flushed
3742 * into the VM's RAM later.
3743 */
3744 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3745 ram_state->migration_dirty_pages++;
3746 }
13af18f2
ZC
3747 return block->colo_cache + offset;
3748}
3749
3d0684b2
JQ
3750/**
3751 * ram_handle_compressed: handle the zero page case
3752 *
56e93d26
JQ
3753 * If a page (or a whole RDMA chunk) has been
3754 * determined to be zero, then zap it.
3d0684b2
JQ
3755 *
3756 * @host: host address for the zero page
3757 * @ch: the byte the page is filled with; only zero is supported
3758 * @size: size of the zero page
56e93d26
JQ
3759 */
3760void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3761{
3762 if (ch != 0 || !is_zero_range(host, size)) {
3763 memset(host, ch, size);
3764 }
3765}
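/*
 * Editorial note: when @ch is zero and the page already reads as zero, the
 * memset() above is skipped, so a still-untouched destination page is not
 * written to (and therefore not needlessly allocated or dirtied).
 */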
3766
797ca154
XG
3767/* return the size after decompression, or negative value on error */
3768static int
3769qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3770 const uint8_t *source, size_t source_len)
3771{
3772 int err;
3773
3774 err = inflateReset(stream);
3775 if (err != Z_OK) {
3776 return -1;
3777 }
3778
3779 stream->avail_in = source_len;
3780 stream->next_in = (uint8_t *)source;
3781 stream->avail_out = dest_len;
3782 stream->next_out = dest;
3783
3784 err = inflate(stream, Z_NO_FLUSH);
3785 if (err != Z_STREAM_END) {
3786 return -1;
3787 }
3788
3789 return stream->total_out;
3790}
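/*
 * Editorial note: the z_stream passed in here is initialised once with
 * inflateInit() in compress_threads_load_setup() and torn down with
 * inflateEnd() in compress_threads_load_cleanup(); each page only pays for
 * an inflateReset(), not a full init/teardown cycle.
 */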
3791
56e93d26
JQ
3792static void *do_data_decompress(void *opaque)
3793{
3794 DecompressParam *param = opaque;
3795 unsigned long pagesize;
33d151f4 3796 uint8_t *des;
34ab9e97 3797 int len, ret;
56e93d26 3798
33d151f4 3799 qemu_mutex_lock(&param->mutex);
90e56fb4 3800 while (!param->quit) {
33d151f4
LL
3801 if (param->des) {
3802 des = param->des;
3803 len = param->len;
3804 param->des = 0;
3805 qemu_mutex_unlock(&param->mutex);
3806
56e93d26 3807 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3808
3809 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3810 param->compbuf, len);
f548222c 3811 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3812 error_report("decompress data failed");
3813 qemu_file_set_error(decomp_file, ret);
3814 }
73a8912b 3815
33d151f4
LL
3816 qemu_mutex_lock(&decomp_done_lock);
3817 param->done = true;
3818 qemu_cond_signal(&decomp_done_cond);
3819 qemu_mutex_unlock(&decomp_done_lock);
3820
3821 qemu_mutex_lock(&param->mutex);
3822 } else {
3823 qemu_cond_wait(&param->cond, &param->mutex);
3824 }
56e93d26 3825 }
33d151f4 3826 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3827
3828 return NULL;
3829}
3830
34ab9e97 3831static int wait_for_decompress_done(void)
5533b2e9
LL
3832{
3833 int idx, thread_count;
3834
3835 if (!migrate_use_compression()) {
34ab9e97 3836 return 0;
5533b2e9
LL
3837 }
3838
3839 thread_count = migrate_decompress_threads();
3840 qemu_mutex_lock(&decomp_done_lock);
3841 for (idx = 0; idx < thread_count; idx++) {
3842 while (!decomp_param[idx].done) {
3843 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3844 }
3845 }
3846 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3847 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3848}
3849
f0afa331 3850static void compress_threads_load_cleanup(void)
56e93d26
JQ
3851{
3852 int i, thread_count;
3853
3416ab5b
JQ
3854 if (!migrate_use_compression()) {
3855 return;
3856 }
56e93d26
JQ
3857 thread_count = migrate_decompress_threads();
3858 for (i = 0; i < thread_count; i++) {
797ca154
XG
3859 /*
3860 * we use it as an indicator of whether the thread is
3861 * properly initialized or not
3862 */
3863 if (!decomp_param[i].compbuf) {
3864 break;
3865 }
3866
56e93d26 3867 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3868 decomp_param[i].quit = true;
56e93d26
JQ
3869 qemu_cond_signal(&decomp_param[i].cond);
3870 qemu_mutex_unlock(&decomp_param[i].mutex);
3871 }
3872 for (i = 0; i < thread_count; i++) {
797ca154
XG
3873 if (!decomp_param[i].compbuf) {
3874 break;
3875 }
3876
56e93d26
JQ
3877 qemu_thread_join(decompress_threads + i);
3878 qemu_mutex_destroy(&decomp_param[i].mutex);
3879 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3880 inflateEnd(&decomp_param[i].stream);
56e93d26 3881 g_free(decomp_param[i].compbuf);
797ca154 3882 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3883 }
3884 g_free(decompress_threads);
3885 g_free(decomp_param);
56e93d26
JQ
3886 decompress_threads = NULL;
3887 decomp_param = NULL;
34ab9e97 3888 decomp_file = NULL;
56e93d26
JQ
3889}
3890
34ab9e97 3891static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3892{
3893 int i, thread_count;
3894
3895 if (!migrate_use_compression()) {
3896 return 0;
3897 }
3898
3899 thread_count = migrate_decompress_threads();
3900 decompress_threads = g_new0(QemuThread, thread_count);
3901 decomp_param = g_new0(DecompressParam, thread_count);
3902 qemu_mutex_init(&decomp_done_lock);
3903 qemu_cond_init(&decomp_done_cond);
34ab9e97 3904 decomp_file = f;
797ca154
XG
3905 for (i = 0; i < thread_count; i++) {
3906 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3907 goto exit;
3908 }
3909
3910 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3911 qemu_mutex_init(&decomp_param[i].mutex);
3912 qemu_cond_init(&decomp_param[i].cond);
3913 decomp_param[i].done = true;
3914 decomp_param[i].quit = false;
3915 qemu_thread_create(decompress_threads + i, "decompress",
3916 do_data_decompress, decomp_param + i,
3917 QEMU_THREAD_JOINABLE);
3918 }
3919 return 0;
3920exit:
3921 compress_threads_load_cleanup();
3922 return -1;
3923}
3924
c1bc6626 3925static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3926 void *host, int len)
3927{
3928 int idx, thread_count;
3929
3930 thread_count = migrate_decompress_threads();
73a8912b 3931 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3932 while (true) {
3933 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3934 if (decomp_param[idx].done) {
33d151f4
LL
3935 decomp_param[idx].done = false;
3936 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3937 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3938 decomp_param[idx].des = host;
3939 decomp_param[idx].len = len;
33d151f4
LL
3940 qemu_cond_signal(&decomp_param[idx].cond);
3941 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3942 break;
3943 }
3944 }
3945 if (idx < thread_count) {
3946 break;
73a8912b
LL
3947 } else {
3948 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3949 }
3950 }
73a8912b 3951 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3952}
3953
13af18f2
ZC
3954/*
3955 * COLO cache: this is for the secondary VM; we cache the whole
3956 * memory of the secondary VM. The global lock needs to be held
3957 * to call this helper.
3958 */
3959int colo_init_ram_cache(void)
3960{
3961 RAMBlock *block;
3962
3963 rcu_read_lock();
fbd162e6 3964 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
3965 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3966 NULL,
3967 false);
3968 if (!block->colo_cache) {
3969 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3970 " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3971 block->used_length);
3972 goto out_locked;
3973 }
3974 memcpy(block->colo_cache, block->host, block->used_length);
3975 }
3976 rcu_read_unlock();
7d9acafa
ZC
3977 /*
3978 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3979 * to decide which pages in the cache should be flushed into the SVM's RAM.
3980 * Here we use the same name 'ram_bitmap' as for migration.
3981 */
3982 if (ram_bytes_total()) {
3983 RAMBlock *block;
3984
fbd162e6 3985 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3986 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3987
3988 block->bmap = bitmap_new(pages);
3989 bitmap_set(block->bmap, 0, pages);
3990 }
3991 }
3992 ram_state = g_new0(RAMState, 1);
3993 ram_state->migration_dirty_pages = 0;
c6e5bafb 3994 qemu_mutex_init(&ram_state->bitmap_mutex);
d1955d22 3995 memory_global_dirty_log_start();
7d9acafa 3996
13af18f2
ZC
3997 return 0;
3998
3999out_locked:
7d9acafa 4000
fbd162e6 4001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
4002 if (block->colo_cache) {
4003 qemu_anon_ram_free(block->colo_cache, block->used_length);
4004 block->colo_cache = NULL;
4005 }
4006 }
4007
4008 rcu_read_unlock();
4009 return -errno;
4010}
4011
4012 /* The global lock needs to be held to call this helper */
4013void colo_release_ram_cache(void)
4014{
4015 RAMBlock *block;
4016
d1955d22 4017 memory_global_dirty_log_stop();
fbd162e6 4018 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
4019 g_free(block->bmap);
4020 block->bmap = NULL;
4021 }
4022
13af18f2 4023 rcu_read_lock();
7d9acafa 4024
fbd162e6 4025 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
4026 if (block->colo_cache) {
4027 qemu_anon_ram_free(block->colo_cache, block->used_length);
4028 block->colo_cache = NULL;
4029 }
4030 }
7d9acafa 4031
13af18f2 4032 rcu_read_unlock();
c6e5bafb 4033 qemu_mutex_destroy(&ram_state->bitmap_mutex);
7d9acafa
ZC
4034 g_free(ram_state);
4035 ram_state = NULL;
13af18f2
ZC
4036}
4037
f265e0e4
JQ
4038/**
4039 * ram_load_setup: Setup RAM for migration incoming side
4040 *
4041 * Returns zero to indicate success and negative for error
4042 *
4043 * @f: QEMUFile where to receive the data
4044 * @opaque: RAMState pointer
4045 */
4046static int ram_load_setup(QEMUFile *f, void *opaque)
4047{
34ab9e97 4048 if (compress_threads_load_setup(f)) {
797ca154
XG
4049 return -1;
4050 }
4051
f265e0e4 4052 xbzrle_load_setup();
f9494614 4053 ramblock_recv_map_init();
13af18f2 4054
f265e0e4
JQ
4055 return 0;
4056}
4057
4058static int ram_load_cleanup(void *opaque)
4059{
f9494614 4060 RAMBlock *rb;
56eb90af 4061
fbd162e6 4062 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
56eb90af
JH
4063 if (ramblock_is_pmem(rb)) {
4064 pmem_persist(rb->host, rb->used_length);
4065 }
4066 }
4067
f265e0e4 4068 xbzrle_load_cleanup();
f0afa331 4069 compress_threads_load_cleanup();
f9494614 4070
fbd162e6 4071 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
4072 g_free(rb->receivedmap);
4073 rb->receivedmap = NULL;
4074 }
13af18f2 4075
f265e0e4
JQ
4076 return 0;
4077}
4078
3d0684b2
JQ
4079/**
4080 * ram_postcopy_incoming_init: allocate postcopy data structures
4081 *
4082 * Returns 0 for success and negative if there was one error
4083 *
4084 * @mis: current migration incoming state
4085 *
4086 * Allocate the data structures etc. needed by incoming migration with
4087 * postcopy-ram. postcopy-ram's similarly named
4088 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4089 */
4090int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4091{
c136180c 4092 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4093}
4094
3d0684b2
JQ
4095/**
4096 * ram_load_postcopy: load a page in postcopy case
4097 *
4098 * Returns 0 for success or -errno in case of error
4099 *
a7180877
DDAG
4100 * Called in postcopy mode by ram_load().
4101 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4102 *
4103 * @f: QEMUFile to receive the data from
a7180877
DDAG
4104 */
4105static int ram_load_postcopy(QEMUFile *f)
4106{
4107 int flags = 0, ret = 0;
4108 bool place_needed = false;
1aa83678 4109 bool matches_target_page_size = false;
a7180877
DDAG
4110 MigrationIncomingState *mis = migration_incoming_get_current();
4111 /* Temporary page that is later 'placed' */
4112 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 4113 void *last_host = NULL;
a3b6ff6d 4114 bool all_zero = false;
a7180877
DDAG
4115
4116 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4117 ram_addr_t addr;
4118 void *host = NULL;
4119 void *page_buffer = NULL;
4120 void *place_source = NULL;
df9ff5e1 4121 RAMBlock *block = NULL;
a7180877 4122 uint8_t ch;
a7180877
DDAG
4123
4124 addr = qemu_get_be64(f);
7a9ddfbf
PX
4125
4126 /*
4127 * If there is a QEMUFile error, we should stop here; "addr"
4128 * may then be invalid
4129 */
4130 ret = qemu_file_get_error(f);
4131 if (ret) {
4132 break;
4133 }
4134
a7180877
DDAG
4135 flags = addr & ~TARGET_PAGE_MASK;
4136 addr &= TARGET_PAGE_MASK;
4137
4138 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4139 place_needed = false;
bb890ed5 4140 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 4141 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
4142
4143 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
4144 if (!host) {
4145 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4146 ret = -EINVAL;
4147 break;
4148 }
1aa83678 4149 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4150 /*
28abd200
DDAG
4151 * Postcopy requires that we place whole host pages atomically;
4152 * these may be huge pages for RAMBlocks that are backed by
4153 * hugetlbfs.
a7180877
DDAG
4154 * To make it atomic, the data is read into a temporary page
4155 * that's moved into place later.
4156 * The migration protocol uses possibly smaller target pages;
4157 * however, the source ensures it always sends all the components
4158 * of a host page in order.
4159 */
4160 page_buffer = postcopy_host_page +
28abd200 4161 ((uintptr_t)host & (block->page_size - 1));
a7180877 4162 /* If all TP are zero then we can optimise the place */
28abd200 4163 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 4164 all_zero = true;
c53b7ddc
DDAG
4165 } else {
4166 /* not the 1st TP within the HP */
4167 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 4168 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
4169 host, last_host);
4170 ret = -EINVAL;
4171 break;
4172 }
a7180877
DDAG
4173 }
4174
c53b7ddc 4175
a7180877
DDAG
4176 /*
4177 * If it's the last part of a host page then we place the host
4178 * page
4179 */
4180 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 4181 (block->page_size - 1)) == 0;
a7180877
DDAG
4182 place_source = postcopy_host_page;
4183 }
c53b7ddc 4184 last_host = host;
a7180877
DDAG
4185
4186 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4187 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
4188 ch = qemu_get_byte(f);
4189 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4190 if (ch) {
4191 all_zero = false;
4192 }
4193 break;
4194
4195 case RAM_SAVE_FLAG_PAGE:
4196 all_zero = false;
1aa83678
PX
4197 if (!matches_target_page_size) {
4198 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4199 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4200 } else {
1aa83678
PX
4201 /*
4202 * For small pages that matches target page size, we
4203 * avoid the qemu_file copy. Instead we directly use
4204 * the buffer of QEMUFile to place the page. Note: we
4205 * cannot do any QEMUFile operation before using that
4206 * buffer to make sure the buffer is valid when
4207 * placing the page.
a7180877
DDAG
4208 */
4209 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4210 TARGET_PAGE_SIZE);
4211 }
4212 break;
4213 case RAM_SAVE_FLAG_EOS:
4214 /* normal exit */
6df264ac 4215 multifd_recv_sync_main();
a7180877
DDAG
4216 break;
4217 default:
4218 error_report("Unknown combination of migration flags: %#x"
4219 " (postcopy mode)", flags);
4220 ret = -EINVAL;
7a9ddfbf
PX
4221 break;
4222 }
4223
4224 /* Detect for any possible file errors */
4225 if (!ret && qemu_file_get_error(f)) {
4226 ret = qemu_file_get_error(f);
a7180877
DDAG
4227 }
4228
7a9ddfbf 4229 if (!ret && place_needed) {
a7180877 4230 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
4231 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4232
a7180877 4233 if (all_zero) {
df9ff5e1 4234 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 4235 block);
a7180877 4236 } else {
df9ff5e1 4237 ret = postcopy_place_page(mis, place_dest,
8be4620b 4238 place_source, block);
a7180877
DDAG
4239 }
4240 }
a7180877
DDAG
4241 }
4242
4243 return ret;
4244}
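/*
 * Editorial sketch (not part of ram.c): the host-page placement test used
 * above, with hypothetical 2 MiB hugepage-backed RAMBlocks and 4 KiB target
 * pages.  Incoming target pages accumulate in the temporary host page and
 * the whole host page is placed once its last target page has arrived.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool example_place_needed(uintptr_t host, size_t host_page_size,
                                 size_t target_page_size)
{
    /* mirrors: (((uintptr_t)host + TARGET_PAGE_SIZE) & (block->page_size - 1)) == 0 */
    return ((host + target_page_size) & (host_page_size - 1)) == 0;
}
/* e.g. example_place_needed(0x7f00003ff000, 2 * 1024 * 1024, 4096) == true */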
4245
acab30b8
DHB
4246static bool postcopy_is_advised(void)
4247{
4248 PostcopyState ps = postcopy_state_get();
4249 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4250}
4251
4252static bool postcopy_is_running(void)
4253{
4254 PostcopyState ps = postcopy_state_get();
4255 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4256}
4257
e6f4aa18
ZC
4258/*
4259 * Flush the content of the RAM cache into the SVM's memory.
4260 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4261 */
4262static void colo_flush_ram_cache(void)
4263{
4264 RAMBlock *block = NULL;
4265 void *dst_host;
4266 void *src_host;
4267 unsigned long offset = 0;
4268
d1955d22
HZ
4269 memory_global_dirty_log_sync();
4270 rcu_read_lock();
fbd162e6 4271 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7a3e9571 4272 ramblock_sync_dirty_bitmap(ram_state, block);
d1955d22
HZ
4273 }
4274 rcu_read_unlock();
4275
e6f4aa18
ZC
4276 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4277 rcu_read_lock();
4278 block = QLIST_FIRST_RCU(&ram_list.blocks);
4279
4280 while (block) {
4281 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4282
4283 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4284 offset = 0;
4285 block = QLIST_NEXT_RCU(block, next);
4286 } else {
4287 migration_bitmap_clear_dirty(ram_state, block, offset);
4288 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4289 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4290 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4291 }
4292 }
4293
4294 rcu_read_unlock();
4295 trace_colo_flush_ram_cache_end();
4296}
4297
10da4a36
WY
4298/**
4299 * ram_load_precopy: load pages in precopy case
4300 *
4301 * Returns 0 for success or -errno in case of error
4302 *
4303 * Called in precopy mode by ram_load().
4304 * rcu_read_lock is taken prior to this being called.
4305 *
4306 * @f: QEMUFile to receive the data from
4307 */
4308static int ram_load_precopy(QEMUFile *f)
56e93d26 4309{
10da4a36 4310 int flags = 0, ret = 0, invalid_flags = 0, len = 0;
ef08fb38 4311 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4312 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
4313 if (!migrate_use_compression()) {
4314 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4315 }
a7180877 4316
10da4a36 4317 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4318 ram_addr_t addr, total_ram_bytes;
a776aa15 4319 void *host = NULL;
56e93d26
JQ
4320 uint8_t ch;
4321
4322 addr = qemu_get_be64(f);
4323 flags = addr & ~TARGET_PAGE_MASK;
4324 addr &= TARGET_PAGE_MASK;
4325
edc60127
JQ
4326 if (flags & invalid_flags) {
4327 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4328 error_report("Received an unexpected compressed page");
4329 }
4330
4331 ret = -EINVAL;
4332 break;
4333 }
4334
bb890ed5 4335 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4336 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
4337 RAMBlock *block = ram_block_from_stream(f, flags);
4338
13af18f2
ZC
4339 /*
4340 * After going into COLO, we should load the Page into colo_cache.
4341 */
4342 if (migration_incoming_in_colo_state()) {
4343 host = colo_cache_from_block_offset(block, addr);
4344 } else {
4345 host = host_from_ram_block_offset(block, addr);
4346 }
a776aa15
DDAG
4347 if (!host) {
4348 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4349 ret = -EINVAL;
4350 break;
4351 }
13af18f2
ZC
4352
4353 if (!migration_incoming_in_colo_state()) {
4354 ramblock_recv_bitmap_set(block, host);
4355 }
4356
1db9d8e5 4357 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4358 }
4359
56e93d26
JQ
4360 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4361 case RAM_SAVE_FLAG_MEM_SIZE:
4362 /* Synchronize RAM block list */
4363 total_ram_bytes = addr;
4364 while (!ret && total_ram_bytes) {
4365 RAMBlock *block;
56e93d26
JQ
4366 char id[256];
4367 ram_addr_t length;
4368
4369 len = qemu_get_byte(f);
4370 qemu_get_buffer(f, (uint8_t *)id, len);
4371 id[len] = 0;
4372 length = qemu_get_be64(f);
4373
e3dd7493 4374 block = qemu_ram_block_by_name(id);
b895de50
CLG
4375 if (block && !qemu_ram_is_migratable(block)) {
4376 error_report("block %s should not be migrated !", id);
4377 ret = -EINVAL;
4378 } else if (block) {
e3dd7493
DDAG
4379 if (length != block->used_length) {
4380 Error *local_err = NULL;
56e93d26 4381
fa53a0e5 4382 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4383 &local_err);
4384 if (local_err) {
4385 error_report_err(local_err);
56e93d26 4386 }
56e93d26 4387 }
ef08fb38
DDAG
4388 /* For postcopy we need to check hugepage sizes match */
4389 if (postcopy_advised &&
4390 block->page_size != qemu_host_page_size) {
4391 uint64_t remote_page_size = qemu_get_be64(f);
4392 if (remote_page_size != block->page_size) {
4393 error_report("Mismatched RAM page size %s "
4394 "(local) %zd != %" PRId64,
4395 id, block->page_size,
4396 remote_page_size);
4397 ret = -EINVAL;
4398 }
4399 }
fbd162e6
YK
4400 if (migrate_ignore_shared()) {
4401 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4402 if (ramblock_is_ignored(block) &&
4403 block->mr->addr != addr) {
4404 error_report("Mismatched GPAs for block %s "
4405 "%" PRId64 " != %" PRId64,
4406 id, (uint64_t)addr,
4407 (uint64_t)block->mr->addr);
4408 ret = -EINVAL;
4409 }
4410 }
e3dd7493
DDAG
4411 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4412 block->idstr);
4413 } else {
56e93d26
JQ
4414 error_report("Unknown ramblock \"%s\", cannot "
4415 "accept migration", id);
4416 ret = -EINVAL;
4417 }
4418
4419 total_ram_bytes -= length;
4420 }
4421 break;
a776aa15 4422
bb890ed5 4423 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4424 ch = qemu_get_byte(f);
4425 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4426 break;
a776aa15 4427
56e93d26 4428 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4429 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4430 break;
56e93d26 4431
a776aa15 4432 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4433 len = qemu_get_be32(f);
4434 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4435 error_report("Invalid compressed data length: %d", len);
4436 ret = -EINVAL;
4437 break;
4438 }
c1bc6626 4439 decompress_data_with_multi_threads(f, host, len);
56e93d26 4440 break;
a776aa15 4441
56e93d26 4442 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4443 if (load_xbzrle(f, addr, host) < 0) {
4444 error_report("Failed to decompress XBZRLE page at "
4445 RAM_ADDR_FMT, addr);
4446 ret = -EINVAL;
4447 break;
4448 }
4449 break;
4450 case RAM_SAVE_FLAG_EOS:
4451 /* normal exit */
6df264ac 4452 multifd_recv_sync_main();
56e93d26
JQ
4453 break;
4454 default:
4455 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4456 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4457 } else {
4458 error_report("Unknown combination of migration flags: %#x",
4459 flags);
4460 ret = -EINVAL;
4461 }
4462 }
4463 if (!ret) {
4464 ret = qemu_file_get_error(f);
4465 }
4466 }
4467
10da4a36
WY
4468 return ret;
4469}
4470
4471static int ram_load(QEMUFile *f, void *opaque, int version_id)
4472{
4473 int ret = 0;
4474 static uint64_t seq_iter;
4475 /*
4476 * If system is running in postcopy mode, page inserts to host memory must
4477 * be atomic
4478 */
4479 bool postcopy_running = postcopy_is_running();
4480
4481 seq_iter++;
4482
4483 if (version_id != 4) {
4484 return -EINVAL;
4485 }
4486
4487 /*
4488 * This RCU critical section can be very long running.
4489 * When RCU reclaims in the code start to become numerous,
4490 * it will be necessary to reduce the granularity of this
4491 * critical section.
4492 */
4493 rcu_read_lock();
4494
4495 if (postcopy_running) {
4496 ret = ram_load_postcopy(f);
4497 } else {
4498 ret = ram_load_precopy(f);
4499 }
4500
34ab9e97 4501 ret |= wait_for_decompress_done();
56e93d26 4502 rcu_read_unlock();
55c4446b 4503 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4504
4505 if (!ret && migration_incoming_in_colo_state()) {
4506 colo_flush_ram_cache();
4507 }
56e93d26
JQ
4508 return ret;
4509}
4510
c6467627
VSO
4511static bool ram_has_postcopy(void *opaque)
4512{
469dd51b 4513 RAMBlock *rb;
fbd162e6 4514 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4515 if (ramblock_is_pmem(rb)) {
4516 info_report("Block: %s, host: %p is an nvdimm memory, postcopy"
4517 " is not supported now!", rb->idstr, rb->host);
4518 return false;
4519 }
4520 }
4521
c6467627
VSO
4522 return migrate_postcopy_ram();
4523}
4524
edd090c7
PX
4525/* Sync all the dirty bitmap with destination VM. */
4526static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4527{
4528 RAMBlock *block;
4529 QEMUFile *file = s->to_dst_file;
4530 int ramblock_count = 0;
4531
4532 trace_ram_dirty_bitmap_sync_start();
4533
fbd162e6 4534 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4535 qemu_savevm_send_recv_bitmap(file, block->idstr);
4536 trace_ram_dirty_bitmap_request(block->idstr);
4537 ramblock_count++;
4538 }
4539
4540 trace_ram_dirty_bitmap_sync_wait();
4541
4542 /* Wait until all the ramblocks' dirty bitmap synced */
4543 while (ramblock_count--) {
4544 qemu_sem_wait(&s->rp_state.rp_sem);
4545 }
4546
4547 trace_ram_dirty_bitmap_sync_complete();
4548
4549 return 0;
4550}
4551
4552static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4553{
4554 qemu_sem_post(&s->rp_state.rp_sem);
4555}
4556
a335debb
PX
4557/*
4558 * Read the received bitmap and invert it to form the initial dirty bitmap.
4559 * This is only used when a postcopy migration is paused and wants
4560 * to resume from a middle point.
4561 */
4562int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4563{
4564 int ret = -EINVAL;
4565 QEMUFile *file = s->rp_state.from_dst_file;
4566 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4567 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4568 uint64_t size, end_mark;
4569
4570 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4571
4572 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4573 error_report("%s: incorrect state %s", __func__,
4574 MigrationStatus_str(s->state));
4575 return -EINVAL;
4576 }
4577
4578 /*
4579 * Note: see comments in ramblock_recv_bitmap_send() on why we
4580 * need the endianness conversion and the padding.
4581 */
4582 local_size = ROUND_UP(local_size, 8);
4583
4584 /* Add paddings */
4585 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4586
4587 size = qemu_get_be64(file);
4588
4589 /* The size of the bitmap should match with our ramblock */
4590 if (size != local_size) {
4591 error_report("%s: ramblock '%s' bitmap size mismatch "
4592 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4593 block->idstr, size, local_size);
4594 ret = -EINVAL;
4595 goto out;
4596 }
4597
4598 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4599 end_mark = qemu_get_be64(file);
4600
4601 ret = qemu_file_get_error(file);
4602 if (ret || size != local_size) {
4603 error_report("%s: read bitmap failed for ramblock '%s': %d"
4604 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4605 __func__, block->idstr, ret, local_size, size);
4606 ret = -EIO;
4607 goto out;
4608 }
4609
4610 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4611 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4612 __func__, block->idstr, end_mark);
4613 ret = -EINVAL;
4614 goto out;
4615 }
4616
4617 /*
4618 * Endianness conversion. We are in postcopy (though paused).
4619 * The dirty bitmap won't change, so we can modify it directly.
4620 */
4621 bitmap_from_le(block->bmap, le_bitmap, nbits);
4622
4623 /*
4624 * What we received is the "received bitmap". Invert it to form the
4625 * initial dirty bitmap for this ramblock.
4626 */
4627 bitmap_complement(block->bmap, block->bmap, nbits);
4628
4629 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4630
edd090c7
PX
4631 /*
4632 * We succeeded in syncing the bitmap for the current ramblock. If this is
4633 * the last one to sync, we need to notify the main send thread.
4634 */
4635 ram_dirty_bitmap_reload_notify(s);
4636
a335debb
PX
4637 ret = 0;
4638out:
bf269906 4639 g_free(le_bitmap);
a335debb
PX
4640 return ret;
4641}
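/*
 * Editorial sketch (derived from the reads above; the sending side is
 * ramblock_recv_bitmap_send()): the per-ramblock bitmap message is
 *
 *   be64   payload size = DIV_ROUND_UP(nr_target_pages, 8), rounded up to 8
 *   bytes  little-endian bitmap, one bit per target page, padded
 *   be64   RAMBLOCK_RECV_BITMAP_ENDING end mark
 *
 * The "already received" bitmap is then complemented to become the dirty
 * bitmap from which the paused postcopy migration resumes.
 */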
4642
edd090c7
PX
4643static int ram_resume_prepare(MigrationState *s, void *opaque)
4644{
4645 RAMState *rs = *(RAMState **)opaque;
08614f34 4646 int ret;
edd090c7 4647
08614f34
PX
4648 ret = ram_dirty_bitmap_sync_all(s, rs);
4649 if (ret) {
4650 return ret;
4651 }
4652
4653 ram_state_resume_prepare(rs, s->to_dst_file);
4654
4655 return 0;
edd090c7
PX
4656}
4657
56e93d26 4658static SaveVMHandlers savevm_ram_handlers = {
9907e842 4659 .save_setup = ram_save_setup,
56e93d26 4660 .save_live_iterate = ram_save_iterate,
763c906b 4661 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4662 .save_live_complete_precopy = ram_save_complete,
c6467627 4663 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4664 .save_live_pending = ram_save_pending,
4665 .load_state = ram_load,
f265e0e4
JQ
4666 .save_cleanup = ram_save_cleanup,
4667 .load_setup = ram_load_setup,
4668 .load_cleanup = ram_load_cleanup,
edd090c7 4669 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4670};
4671
4672void ram_mig_init(void)
4673{
4674 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 4675 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4676}