[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
71bb07db 39#include "socket.h"
f2a8f0a6 40#include "migration/register.h"
7b1e1a22 41#include "migration/misc.h"
08a0aee1 42#include "qemu-file.h"
be07b0ac 43#include "postcopy-ram.h"
53d37d36 44#include "page_cache.h"
56e93d26 45#include "qemu/error-report.h"
e688df6b 46#include "qapi/error.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
af8b7d2b
JQ
55#include "sysemu/sysemu.h"
56#include "qemu/uuid.h"
edd090c7 57#include "savevm.h"
b9ee2f7d 58#include "qemu/iov.h"
56e93d26 59
56e93d26
JQ
60/***********************************************************/
61/* ram save/restore */
62
bb890ed5
JQ
63/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
64 * worked for pages that were filled with the same char. We switched
65 * it to only search for the zero value, and renamed it to avoid
66 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
67 */
68
56e93d26 69#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 70#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
71#define RAM_SAVE_FLAG_MEM_SIZE 0x04
72#define RAM_SAVE_FLAG_PAGE 0x08
73#define RAM_SAVE_FLAG_EOS 0x10
74#define RAM_SAVE_FLAG_CONTINUE 0x20
75#define RAM_SAVE_FLAG_XBZRLE 0x40
76/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
77#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
78
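/*
 * Illustrative sketch (editor's addition, not compiled): how the load side
 * splits the 64-bit header word that carries these flags.  The real decode
 * lives in ram_load(), later in this file; TARGET_PAGE_MASK keeps the
 * page-aligned address, the low bits keep the RAM_SAVE_FLAG_* bits.
 */
#if 0
static void example_split_header_word(QEMUFile *f)
{
    ram_addr_t addr = qemu_get_be64(f);
    int flags = addr & ~TARGET_PAGE_MASK;   /* RAM_SAVE_FLAG_* bits */

    addr &= TARGET_PAGE_MASK;               /* page address within the block */
    (void)addr; (void)flags;
}
#endif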
56e93d26
JQ
79static inline bool is_zero_range(uint8_t *p, uint64_t size)
80{
a1febc49 81 return buffer_is_zero(p, size);
56e93d26
JQ
82}
83
9360447d
JQ
84XBZRLECacheStats xbzrle_counters;
85
56e93d26
JQ
86/* struct containing the XBZRLE cache and a static page
87 used for compression */
88static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
c00e0928
JQ
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
f265e0e4
JQ
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
56e93d26
JQ
100} XBZRLE;
101
56e93d26
JQ
102static void XBZRLE_cache_lock(void)
103{
104 if (migrate_use_xbzrle())
105 qemu_mutex_lock(&XBZRLE.lock);
106}
107
108static void XBZRLE_cache_unlock(void)
109{
110 if (migrate_use_xbzrle())
111 qemu_mutex_unlock(&XBZRLE.lock);
112}
113
3d0684b2
JQ
114/**
115 * xbzrle_cache_resize: resize the xbzrle cache
116 *
117 * This function is called from qmp_migrate_set_cache_size in the main
118 * thread, possibly while a migration is in progress. A running
119 * migration may be using the cache and might finish during this call,
120 * hence changes to the cache are protected by the XBZRLE.lock mutex.
121 *
c9dede2d 122 * Returns 0 for success or -1 for error
3d0684b2
JQ
123 *
124 * @new_size: new cache size
8acabf69 125 * @errp: set *errp if the check failed, with reason
56e93d26 126 */
c9dede2d 127int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
128{
129 PageCache *new_cache;
c9dede2d 130 int64_t ret = 0;
56e93d26 131
8acabf69
JQ
132 /* Check for truncation */
133 if (new_size != (size_t)new_size) {
134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135 "exceeding address space");
136 return -1;
137 }
138
2a313e5c
JQ
139 if (new_size == migrate_xbzrle_cache_size()) {
140 /* nothing to do */
c9dede2d 141 return 0;
2a313e5c
JQ
142 }
143
56e93d26
JQ
144 XBZRLE_cache_lock();
145
146 if (XBZRLE.cache != NULL) {
80f8dfde 147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 148 if (!new_cache) {
56e93d26
JQ
149 ret = -1;
150 goto out;
151 }
152
153 cache_fini(XBZRLE.cache);
154 XBZRLE.cache = new_cache;
155 }
56e93d26
JQ
156out:
157 XBZRLE_cache_unlock();
158 return ret;
159}
160
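/*
 * Editor's sketch of a hypothetical caller (the real one is the QMP
 * command handler): resize the cache to 128 MiB and report any failure.
 */
#if 0
{
    Error *err = NULL;

    if (xbzrle_cache_resize(128 * 1024 * 1024, &err) < 0) {
        error_report_err(err);
    }
}
#endif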
fbd162e6
YK
161static bool ramblock_is_ignored(RAMBlock *block)
162{
163 return !qemu_ram_is_migratable(block) ||
164 (migrate_ignore_shared() && qemu_ram_is_shared(block));
165}
166
b895de50 167/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
168#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
169 INTERNAL_RAMBLOCK_FOREACH(block) \
170 if (ramblock_is_ignored(block)) {} else
171
b895de50 172#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 173 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
174 if (!qemu_ram_is_migratable(block)) {} else
175
343f632c
DDAG
176#undef RAMBLOCK_FOREACH
177
fbd162e6
YK
178int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
179{
180 RAMBlock *block;
181 int ret = 0;
182
89ac5a1d
DDAG
183 RCU_READ_LOCK_GUARD();
184
fbd162e6
YK
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
fbd162e6
YK
191 return ret;
192}
193
f9494614
AP
194static void ramblock_recv_map_init(void)
195{
196 RAMBlock *rb;
197
fbd162e6 198 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
199 assert(!rb->receivedmap);
200 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
201 }
202}
203
204int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
205{
206 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
207 rb->receivedmap);
208}
209
1cba9f6e
DDAG
210bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
211{
212 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
213}
214
f9494614
AP
215void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
216{
217 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
218}
219
220void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
221 size_t nr)
222{
223 bitmap_set_atomic(rb->receivedmap,
224 ramblock_recv_bitmap_offset(host_addr, rb),
225 nr);
226}
227
a335debb
PX
228#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229
230/*
231 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
232 *
233 * Returns >0 if success with sent bytes, or <0 if error.
234 */
235int64_t ramblock_recv_bitmap_send(QEMUFile *file,
236 const char *block_name)
237{
238 RAMBlock *block = qemu_ram_block_by_name(block_name);
239 unsigned long *le_bitmap, nbits;
240 uint64_t size;
241
242 if (!block) {
243 error_report("%s: invalid block name: %s", __func__, block_name);
244 return -1;
245 }
246
247 nbits = block->used_length >> TARGET_PAGE_BITS;
248
249 /*
250 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
251 * machines we may need 4 more bytes for padding (see the comment
252 * below). So extend it a bit beforehand.
253 */
254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
255
256 /*
257 * Always use little endian when sending the bitmap. This is
258 * required so that it still works when the source and destination
259 * VMs do not share the same endianness. (Note: big endian won't work.)
260 */
261 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
262
263 /* Size of the bitmap, in bytes */
a725ef9f 264 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
265
266 /*
267 * size is always aligned to 8 bytes for 64bit machines, but it
268 * may not be true for 32bit machines. We need this padding to
269 * make sure the migration can survive even between 32bit and
270 * 64bit machines.
271 */
272 size = ROUND_UP(size, 8);
273
274 qemu_put_be64(file, size);
275 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
276 /*
277 * Mark the end, in case the middle part got corrupted for some
278 * mysterious reason.
279 */
280 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
281 qemu_fflush(file);
282
bf269906 283 g_free(le_bitmap);
a335debb
PX
284
285 if (qemu_file_get_error(file)) {
286 return qemu_file_get_error(file);
287 }
288
289 return size + sizeof(size);
290}
291
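/*
 * Editor's sketch (simplified, not the code QEMU uses) of the matching
 * receive side for the format above: read the 8-byte size, the
 * little-endian bitmap, then check the ending marker.  The real
 * counterpart is ram_dirty_bitmap_reload(), later in this file.
 */
#if 0
static int example_recv_bitmap(QEMUFile *file, RAMBlock *block)
{
    uint64_t size = qemu_get_be64(file);
    uint8_t *le_bitmap = g_malloc0(size);
    int ret = 0;

    qemu_get_buffer(file, le_bitmap, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        ret = -1;       /* stream out of sync */
    }
    /* ... convert from little endian and OR into block->bmap ... */
    g_free(le_bitmap);
    return ret;
}
#endif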
ec481c6c
JQ
292/*
293 * An outstanding page request, on the source, having been received
294 * and queued
295 */
296struct RAMSrcPageRequest {
297 RAMBlock *rb;
298 hwaddr offset;
299 hwaddr len;
300
301 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
302};
303
6f37bb8b
JQ
304/* State of RAM for migration */
305struct RAMState {
204b88b8
JQ
306 /* QEMUFile used for this migration */
307 QEMUFile *f;
6f37bb8b
JQ
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
269ace29
JQ
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
6f37bb8b
JQ
314 /* last ram version we have seen */
315 uint32_t last_version;
316 /* We are in the first round */
317 bool ram_bulk_stage;
6eeb63f7
WW
318 /* The free page optimization is enabled */
319 bool fpo_enabled;
8d820d6f
JQ
320 /* How many times we have dirty too many pages */
321 int dirty_rate_high_cnt;
f664da80
JQ
322 /* these variables are used for bitmap sync */
323 /* last time we did a full bitmap_sync */
324 int64_t time_last_bitmap_sync;
eac74159 325 /* bytes transferred at start_time */
c4bdf0cf 326 uint64_t bytes_xfer_prev;
a66cd90c 327 /* number of dirty pages since start_time */
68908ed6 328 uint64_t num_dirty_pages_period;
b5833fde
JQ
329 /* xbzrle misses since the beginning of the period */
330 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
331
332 /* compression statistics since the beginning of the period */
333 /* number of times no free thread was available to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* number of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* number of compressed pages */
338 uint64_t compress_pages_prev;
339
be8b02ed
XG
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
9360447d 344 /* number of dirty bits in the bitmap */
2dfaf12e 345 uint64_t migration_dirty_pages;
386a907b 346 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 347 QemuMutex bitmap_mutex;
68a098f3
JQ
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
ec481c6c
JQ
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
b58deb34 352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
353};
354typedef struct RAMState RAMState;
355
53518d94 356static RAMState *ram_state;
6f37bb8b 357
bd227060
WW
358static NotifierWithReturnList precopy_notifier_list;
359
360void precopy_infrastructure_init(void)
361{
362 notifier_with_return_list_init(&precopy_notifier_list);
363}
364
365void precopy_add_notifier(NotifierWithReturn *n)
366{
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368}
369
370void precopy_remove_notifier(NotifierWithReturn *n)
371{
372 notifier_with_return_remove(n);
373}
374
375int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376{
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382}
383
6eeb63f7
WW
384void precopy_enable_free_page_optimization(void)
385{
386 if (!ram_state) {
387 return;
388 }
389
390 ram_state->fpo_enabled = true;
391}
392
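/*
 * Editor's sketch of how a consumer (e.g. virtio-balloon free page
 * hinting) hooks into the notifications above.  Names prefixed with
 * example_ are hypothetical.
 */
#if 0
static int example_precopy_notify(NotifierWithReturn *n, void *data)
{
    PrecopyNotifyData *pnd = data;

    if (pnd->reason == PRECOPY_NOTIFY_SETUP) {
        precopy_enable_free_page_optimization();
    }
    return 0;
}

static NotifierWithReturn example_notifier = {
    .notify = example_precopy_notify,
};

/* during device setup: */
precopy_add_notifier(&example_notifier);
#endif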
9edabd4d 393uint64_t ram_bytes_remaining(void)
2f4fde93 394{
bae416e5
DDAG
395 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
396 0;
2f4fde93
JQ
397}
398
9360447d 399MigrationStats ram_counters;
96506894 400
b8fb8cb7
DDAG
401/* used by the search for pages to send */
402struct PageSearchStatus {
403 /* Current block being searched */
404 RAMBlock *block;
a935e30f
JQ
405 /* Current page to search from */
406 unsigned long page;
b8fb8cb7
DDAG
407 /* Set once we wrap around */
408 bool complete_round;
409};
410typedef struct PageSearchStatus PageSearchStatus;
411
76e03000
XG
412CompressionStats compression_counters;
413
56e93d26 414struct CompressParam {
56e93d26 415 bool done;
90e56fb4 416 bool quit;
5e5fdcff 417 bool zero_page;
56e93d26
JQ
418 QEMUFile *file;
419 QemuMutex mutex;
420 QemuCond cond;
421 RAMBlock *block;
422 ram_addr_t offset;
34ab9e97
XG
423
424 /* internally used fields */
dcaf446e 425 z_stream stream;
34ab9e97 426 uint8_t *originbuf;
56e93d26
JQ
427};
428typedef struct CompressParam CompressParam;
429
430struct DecompressParam {
73a8912b 431 bool done;
90e56fb4 432 bool quit;
56e93d26
JQ
433 QemuMutex mutex;
434 QemuCond cond;
435 void *des;
d341d9f3 436 uint8_t *compbuf;
56e93d26 437 int len;
797ca154 438 z_stream stream;
56e93d26
JQ
439};
440typedef struct DecompressParam DecompressParam;
441
442static CompressParam *comp_param;
443static QemuThread *compress_threads;
444/* comp_done_cond is used to wake up the migration thread when
445 * one of the compression threads has finished the compression.
446 * comp_done_lock is used to co-work with comp_done_cond.
447 */
0d9f9a5c
LL
448static QemuMutex comp_done_lock;
449static QemuCond comp_done_cond;
56e93d26
JQ
450/* The empty QEMUFileOps will be used by file in CompressParam */
451static const QEMUFileOps empty_ops = { };
452
34ab9e97 453static QEMUFile *decomp_file;
56e93d26
JQ
454static DecompressParam *decomp_param;
455static QemuThread *decompress_threads;
73a8912b
LL
456static QemuMutex decomp_done_lock;
457static QemuCond decomp_done_cond;
56e93d26 458
5e5fdcff 459static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 460 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
461
462static void *do_data_compress(void *opaque)
463{
464 CompressParam *param = opaque;
a7a9a88f
LL
465 RAMBlock *block;
466 ram_addr_t offset;
5e5fdcff 467 bool zero_page;
56e93d26 468
a7a9a88f 469 qemu_mutex_lock(&param->mutex);
90e56fb4 470 while (!param->quit) {
a7a9a88f
LL
471 if (param->block) {
472 block = param->block;
473 offset = param->offset;
474 param->block = NULL;
475 qemu_mutex_unlock(&param->mutex);
476
5e5fdcff
XG
477 zero_page = do_compress_ram_page(param->file, &param->stream,
478 block, offset, param->originbuf);
a7a9a88f 479
0d9f9a5c 480 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 481 param->done = true;
5e5fdcff 482 param->zero_page = zero_page;
0d9f9a5c
LL
483 qemu_cond_signal(&comp_done_cond);
484 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
485
486 qemu_mutex_lock(&param->mutex);
487 } else {
56e93d26
JQ
488 qemu_cond_wait(&param->cond, &param->mutex);
489 }
56e93d26 490 }
a7a9a88f 491 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
492
493 return NULL;
494}
495
f0afa331 496static void compress_threads_save_cleanup(void)
56e93d26
JQ
497{
498 int i, thread_count;
499
05306935 500 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
501 return;
502 }
05306935 503
56e93d26
JQ
504 thread_count = migrate_compress_threads();
505 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
506 /*
507 * we use it as an indicator of whether the thread was
508 * properly initialized or not
509 */
510 if (!comp_param[i].file) {
511 break;
512 }
05306935
FL
513
514 qemu_mutex_lock(&comp_param[i].mutex);
515 comp_param[i].quit = true;
516 qemu_cond_signal(&comp_param[i].cond);
517 qemu_mutex_unlock(&comp_param[i].mutex);
518
56e93d26 519 qemu_thread_join(compress_threads + i);
56e93d26
JQ
520 qemu_mutex_destroy(&comp_param[i].mutex);
521 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 522 deflateEnd(&comp_param[i].stream);
34ab9e97 523 g_free(comp_param[i].originbuf);
dcaf446e
XG
524 qemu_fclose(comp_param[i].file);
525 comp_param[i].file = NULL;
56e93d26 526 }
0d9f9a5c
LL
527 qemu_mutex_destroy(&comp_done_lock);
528 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
529 g_free(compress_threads);
530 g_free(comp_param);
56e93d26
JQ
531 compress_threads = NULL;
532 comp_param = NULL;
56e93d26
JQ
533}
534
dcaf446e 535static int compress_threads_save_setup(void)
56e93d26
JQ
536{
537 int i, thread_count;
538
539 if (!migrate_use_compression()) {
dcaf446e 540 return 0;
56e93d26 541 }
56e93d26
JQ
542 thread_count = migrate_compress_threads();
543 compress_threads = g_new0(QemuThread, thread_count);
544 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
545 qemu_cond_init(&comp_done_cond);
546 qemu_mutex_init(&comp_done_lock);
56e93d26 547 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
548 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
549 if (!comp_param[i].originbuf) {
550 goto exit;
551 }
552
dcaf446e
XG
553 if (deflateInit(&comp_param[i].stream,
554 migrate_compress_level()) != Z_OK) {
34ab9e97 555 g_free(comp_param[i].originbuf);
dcaf446e
XG
556 goto exit;
557 }
558
e110aa91
C
559 /* comp_param[i].file is just used as a dummy buffer to save data,
560 * so set its ops to empty.
56e93d26
JQ
561 */
562 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
563 comp_param[i].done = true;
90e56fb4 564 comp_param[i].quit = false;
56e93d26
JQ
565 qemu_mutex_init(&comp_param[i].mutex);
566 qemu_cond_init(&comp_param[i].cond);
567 qemu_thread_create(compress_threads + i, "compress",
568 do_data_compress, comp_param + i,
569 QEMU_THREAD_JOINABLE);
570 }
dcaf446e
XG
571 return 0;
572
573exit:
574 compress_threads_save_cleanup();
575 return -1;
56e93d26
JQ
576}
577
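/*
 * Editor's sketch (simplified) of how the migration thread feeds one of
 * the workers created above; the real dispatch is
 * compress_page_with_multi_thread(), later in this file.
 */
#if 0
static void example_dispatch_to_worker(int idx, RAMBlock *block,
                                       ram_addr_t offset)
{
    qemu_mutex_lock(&comp_param[idx].mutex);
    comp_param[idx].block = block;
    comp_param[idx].offset = offset;
    qemu_cond_signal(&comp_param[idx].cond);    /* wakes do_data_compress() */
    qemu_mutex_unlock(&comp_param[idx].mutex);
}
#endif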
f986c3d2
JQ
578/* Multiple fd's */
579
af8b7d2b
JQ
580#define MULTIFD_MAGIC 0x11223344U
581#define MULTIFD_VERSION 1
582
6df264ac
JQ
583#define MULTIFD_FLAG_SYNC (1 << 0)
584
efd1a1d6 585/* This value needs to be a multiple of qemu_target_page_size() */
4b0c7264 586#define MULTIFD_PACKET_SIZE (512 * 1024)
efd1a1d6 587
af8b7d2b
JQ
588typedef struct {
589 uint32_t magic;
590 uint32_t version;
591 unsigned char uuid[16]; /* QemuUUID */
592 uint8_t id;
5fbd8b4b
JQ
593 uint8_t unused1[7]; /* Reserved for future use */
594 uint64_t unused2[4]; /* Reserved for future use */
af8b7d2b
JQ
595} __attribute__((packed)) MultiFDInit_t;
596
2a26c979
JQ
597typedef struct {
598 uint32_t magic;
599 uint32_t version;
600 uint32_t flags;
6f862692
JQ
601 /* maximum number of allocated pages */
602 uint32_t pages_alloc;
603 uint32_t pages_used;
2a34ee59
JQ
604 /* size of the next packet that contains pages */
605 uint32_t next_packet_size;
2a26c979 606 uint64_t packet_num;
5fbd8b4b 607 uint64_t unused[4]; /* Reserved for future use */
2a26c979
JQ
608 char ramblock[256];
609 uint64_t offset[];
610} __attribute__((packed)) MultiFDPacket_t;
611
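/*
 * Editor's worked example, assuming a 4 KiB target page size:
 * MULTIFD_PACKET_SIZE / qemu_target_page_size() = 512 KiB / 4 KiB = 128,
 * so each MultiFDPacket_t describes at most 128 pages, the offset[] array
 * holds up to 128 entries, and next_packet_size for a full packet is
 * 128 * 4 KiB = 512 KiB of page data following the header on the wire.
 */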
34c55a94
JQ
612typedef struct {
613 /* number of used pages */
614 uint32_t used;
615 /* number of allocated pages */
616 uint32_t allocated;
617 /* global number of generated multifd packets */
618 uint64_t packet_num;
619 /* offset of each page */
620 ram_addr_t *offset;
621 /* pointer to each page */
622 struct iovec *iov;
623 RAMBlock *block;
624} MultiFDPages_t;
625
8c4598f2
JQ
626typedef struct {
627 /* this fields are not changed once the thread is created */
628 /* channel number */
f986c3d2 629 uint8_t id;
8c4598f2 630 /* channel thread name */
f986c3d2 631 char *name;
8c4598f2 632 /* channel thread id */
f986c3d2 633 QemuThread thread;
8c4598f2 634 /* communication channel */
60df2d4a 635 QIOChannel *c;
8c4598f2 636 /* sem where to wait for more work */
f986c3d2 637 QemuSemaphore sem;
8c4598f2 638 /* this mutex protects the following parameters */
f986c3d2 639 QemuMutex mutex;
8c4598f2 640 /* is this channel thread running */
66770707 641 bool running;
8c4598f2 642 /* should this thread finish */
f986c3d2 643 bool quit;
0beb5ed3
JQ
644 /* thread has work to do */
645 int pending_job;
34c55a94
JQ
646 /* array of pages to send */
647 MultiFDPages_t *pages;
2a26c979
JQ
648 /* packet allocated len */
649 uint32_t packet_len;
650 /* pointer to the packet */
651 MultiFDPacket_t *packet;
652 /* multifd flags for each packet */
653 uint32_t flags;
2a34ee59
JQ
654 /* size of the next packet that contains pages */
655 uint32_t next_packet_size;
2a26c979
JQ
656 /* global number of generated multifd packets */
657 uint64_t packet_num;
408ea6ae
JQ
658 /* thread local variables */
659 /* packets sent through this channel */
660 uint64_t num_packets;
661 /* pages sent through this channel */
662 uint64_t num_pages;
18cdcea3
JQ
663 /* syncs main thread and channels */
664 QemuSemaphore sem_sync;
8c4598f2
JQ
665} MultiFDSendParams;
666
667typedef struct {
668 /* these fields are not changed once the thread is created */
669 /* channel number */
670 uint8_t id;
671 /* channel thread name */
672 char *name;
673 /* channel thread id */
674 QemuThread thread;
675 /* communication channel */
676 QIOChannel *c;
8c4598f2
JQ
677 /* this mutex protects the following parameters */
678 QemuMutex mutex;
679 /* is this channel thread running */
680 bool running;
3c3ca25d
JQ
681 /* should this thread finish */
682 bool quit;
34c55a94
JQ
683 /* array of pages to receive */
684 MultiFDPages_t *pages;
2a26c979
JQ
685 /* packet allocated len */
686 uint32_t packet_len;
687 /* pointer to the packet */
688 MultiFDPacket_t *packet;
689 /* multifd flags for each packet */
690 uint32_t flags;
691 /* global number of generated multifd packets */
692 uint64_t packet_num;
408ea6ae 693 /* thread local variables */
2a34ee59
JQ
694 /* size of the next packet that contains pages */
695 uint32_t next_packet_size;
408ea6ae
JQ
696 /* packets received through this channel */
697 uint64_t num_packets;
698 /* pages received through this channel */
699 uint64_t num_pages;
6df264ac
JQ
700 /* syncs main thread and channels */
701 QemuSemaphore sem_sync;
8c4598f2 702} MultiFDRecvParams;
f986c3d2 703
af8b7d2b
JQ
704static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
705{
706 MultiFDInit_t msg;
707 int ret;
708
709 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
710 msg.version = cpu_to_be32(MULTIFD_VERSION);
711 msg.id = p->id;
712 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
713
714 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
715 if (ret != 0) {
716 return -1;
717 }
718 return 0;
719}
720
721static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
722{
723 MultiFDInit_t msg;
724 int ret;
725
726 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
727 if (ret != 0) {
728 return -1;
729 }
730
341ba0df
PM
731 msg.magic = be32_to_cpu(msg.magic);
732 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
733
734 if (msg.magic != MULTIFD_MAGIC) {
735 error_setg(errp, "multifd: received packet magic %x "
736 "expected %x", msg.magic, MULTIFD_MAGIC);
737 return -1;
738 }
739
740 if (msg.version != MULTIFD_VERSION) {
741 error_setg(errp, "multifd: received packet version %d "
742 "expected %d", msg.version, MULTIFD_VERSION);
743 return -1;
744 }
745
746 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
747 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
748 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
749
750 error_setg(errp, "multifd: received uuid '%s' and expected "
751 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
752 g_free(uuid);
753 g_free(msg_uuid);
754 return -1;
755 }
756
757 if (msg.id > migrate_multifd_channels()) {
758 error_setg(errp, "multifd: received channel id %d is greater "
759 "than number of channels %d", msg.id, migrate_multifd_channels());
760 return -1;
761 }
762
763 return msg.id;
764}
765
34c55a94
JQ
766static MultiFDPages_t *multifd_pages_init(size_t size)
767{
768 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
769
770 pages->allocated = size;
771 pages->iov = g_new0(struct iovec, size);
772 pages->offset = g_new0(ram_addr_t, size);
773
774 return pages;
775}
776
777static void multifd_pages_clear(MultiFDPages_t *pages)
778{
779 pages->used = 0;
780 pages->allocated = 0;
781 pages->packet_num = 0;
782 pages->block = NULL;
783 g_free(pages->iov);
784 pages->iov = NULL;
785 g_free(pages->offset);
786 pages->offset = NULL;
787 g_free(pages);
788}
789
2a26c979
JQ
790static void multifd_send_fill_packet(MultiFDSendParams *p)
791{
792 MultiFDPacket_t *packet = p->packet;
793 int i;
794
2a26c979 795 packet->flags = cpu_to_be32(p->flags);
f2148c4c 796 packet->pages_alloc = cpu_to_be32(p->pages->allocated);
6f862692 797 packet->pages_used = cpu_to_be32(p->pages->used);
2a34ee59 798 packet->next_packet_size = cpu_to_be32(p->next_packet_size);
2a26c979
JQ
799 packet->packet_num = cpu_to_be64(p->packet_num);
800
801 if (p->pages->block) {
802 strncpy(packet->ramblock, p->pages->block->idstr, 256);
803 }
804
805 for (i = 0; i < p->pages->used; i++) {
806 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
807 }
808}
809
810static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
811{
812 MultiFDPacket_t *packet = p->packet;
7ed379b2 813 uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
2a26c979
JQ
814 RAMBlock *block;
815 int i;
816
341ba0df 817 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
818 if (packet->magic != MULTIFD_MAGIC) {
819 error_setg(errp, "multifd: received packet "
820 "magic %x and expected magic %x",
821 packet->magic, MULTIFD_MAGIC);
822 return -1;
823 }
824
341ba0df 825 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
826 if (packet->version != MULTIFD_VERSION) {
827 error_setg(errp, "multifd: received packet "
828 "version %d and expected version %d",
829 packet->version, MULTIFD_VERSION);
830 return -1;
831 }
832
833 p->flags = be32_to_cpu(packet->flags);
834
6f862692 835 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
7ed379b2 836 /*
d884e77b 837 * If we received a packet that is 100 times bigger than expected
7ed379b2
JQ
838 * just stop migration. It is a magic number.
839 */
840 if (packet->pages_alloc > pages_max * 100) {
2a26c979 841 error_setg(errp, "multifd: received packet "
7ed379b2
JQ
842 "with size %d and expected a maximum size of %d",
843 packet->pages_alloc, pages_max * 100) ;
2a26c979
JQ
844 return -1;
845 }
7ed379b2
JQ
846 /*
847 * We received a packet that is bigger than expected but inside
848 * reasonable limits (see previous comment). Just reallocate.
849 */
850 if (packet->pages_alloc > p->pages->allocated) {
851 multifd_pages_clear(p->pages);
f151f8ac 852 p->pages = multifd_pages_init(packet->pages_alloc);
7ed379b2 853 }
2a26c979 854
6f862692
JQ
855 p->pages->used = be32_to_cpu(packet->pages_used);
856 if (p->pages->used > packet->pages_alloc) {
2a26c979 857 error_setg(errp, "multifd: received packet "
6f862692
JQ
858 "with %d pages and expected maximum pages are %d",
859 p->pages->used, packet->pages_alloc) ;
2a26c979
JQ
860 return -1;
861 }
862
2a34ee59 863 p->next_packet_size = be32_to_cpu(packet->next_packet_size);
2a26c979
JQ
864 p->packet_num = be64_to_cpu(packet->packet_num);
865
866 if (p->pages->used) {
867 /* make sure that ramblock is 0 terminated */
868 packet->ramblock[255] = 0;
869 block = qemu_ram_block_by_name(packet->ramblock);
870 if (!block) {
871 error_setg(errp, "multifd: unknown ram block %s",
872 packet->ramblock);
873 return -1;
874 }
875 }
876
877 for (i = 0; i < p->pages->used; i++) {
878 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
879
880 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
881 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
882 " (max " RAM_ADDR_FMT ")",
883 offset, block->used_length);
884 return -1;
885 }
886 p->pages->iov[i].iov_base = block->host + offset;
887 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
888 }
889
890 return 0;
891}
892
f986c3d2
JQ
893struct {
894 MultiFDSendParams *params;
34c55a94
JQ
895 /* array of pages to send */
896 MultiFDPages_t *pages;
6df264ac
JQ
897 /* global number of generated multifd packets */
898 uint64_t packet_num;
b9ee2f7d
JQ
899 /* send channels ready */
900 QemuSemaphore channels_ready;
f986c3d2
JQ
901} *multifd_send_state;
902
b9ee2f7d
JQ
903/*
904 * How do we use multifd_send_state->pages and channel->pages?
905 *
906 * We create a MultiFDPages_t for each channel, plus a main one. Each
907 * time we need to send a batch of pages we swap the one in
908 * multifd_send_state with the one of the channel that is sending it.
909 * There are two reasons for that:
910 * - to avoid doing so many allocations during migration
911 * - to make it easier to know what to free at the end of migration
912 *
913 * This way we always know who owns each "pages" struct, and we don't
914 * need any locking. It belongs either to the migration thread or to
915 * the channel thread. Switching is safe because the migration thread
916 * holds the channel mutex when changing it, and the channel thread
917 * must have finished with its own, otherwise pending_job can't be
918 * false.
919 */
920
1b81c974 921static int multifd_send_pages(RAMState *rs)
b9ee2f7d
JQ
922{
923 int i;
924 static int next_channel;
925 MultiFDSendParams *p = NULL; /* make happy gcc */
926 MultiFDPages_t *pages = multifd_send_state->pages;
927 uint64_t transferred;
928
929 qemu_sem_wait(&multifd_send_state->channels_ready);
930 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
931 p = &multifd_send_state->params[i];
932
933 qemu_mutex_lock(&p->mutex);
713f762a
IR
934 if (p->quit) {
935 error_report("%s: channel %d has already quit!", __func__, i);
936 qemu_mutex_unlock(&p->mutex);
937 return -1;
938 }
b9ee2f7d
JQ
939 if (!p->pending_job) {
940 p->pending_job++;
941 next_channel = (i + 1) % migrate_multifd_channels();
942 break;
943 }
944 qemu_mutex_unlock(&p->mutex);
945 }
946 p->pages->used = 0;
947
948 p->packet_num = multifd_send_state->packet_num++;
949 p->pages->block = NULL;
950 multifd_send_state->pages = p->pages;
951 p->pages = pages;
4fcefd44 952 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
1b81c974 953 qemu_file_update_transfer(rs->f, transferred);
b9ee2f7d
JQ
954 ram_counters.multifd_bytes += transferred;
955 ram_counters.transferred += transferred;
956 qemu_mutex_unlock(&p->mutex);
957 qemu_sem_post(&p->sem);
713f762a
IR
958
959 return 1;
b9ee2f7d
JQ
960}
961
1b81c974 962static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
b9ee2f7d
JQ
963{
964 MultiFDPages_t *pages = multifd_send_state->pages;
965
966 if (!pages->block) {
967 pages->block = block;
968 }
969
970 if (pages->block == block) {
971 pages->offset[pages->used] = offset;
972 pages->iov[pages->used].iov_base = block->host + offset;
973 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
974 pages->used++;
975
976 if (pages->used < pages->allocated) {
713f762a 977 return 1;
b9ee2f7d
JQ
978 }
979 }
980
1b81c974 981 if (multifd_send_pages(rs) < 0) {
713f762a
IR
982 return -1;
983 }
b9ee2f7d
JQ
984
985 if (pages->block != block) {
1b81c974 986 return multifd_queue_page(rs, block, offset);
b9ee2f7d 987 }
713f762a
IR
988
989 return 1;
b9ee2f7d
JQ
990}
991
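/*
 * Editor's sketch of the per-page call site (the real one is
 * ram_save_multifd_page(), later in this file): queue the page and treat
 * a negative return as a fatal error.
 */
#if 0
if (multifd_queue_page(rs, block, offset) < 0) {
    return -1;
}
ram_counters.normal++;
#endif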
66770707 992static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
993{
994 int i;
995
5558c91a
JQ
996 trace_multifd_send_terminate_threads(err != NULL);
997
7a169d74
JQ
998 if (err) {
999 MigrationState *s = migrate_get_current();
1000 migrate_set_error(s, err);
1001 if (s->state == MIGRATION_STATUS_SETUP ||
1002 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1003 s->state == MIGRATION_STATUS_DEVICE ||
1004 s->state == MIGRATION_STATUS_ACTIVE) {
1005 migrate_set_state(&s->state, s->state,
1006 MIGRATION_STATUS_FAILED);
1007 }
1008 }
1009
66770707 1010 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1011 MultiFDSendParams *p = &multifd_send_state->params[i];
1012
1013 qemu_mutex_lock(&p->mutex);
1014 p->quit = true;
1015 qemu_sem_post(&p->sem);
1016 qemu_mutex_unlock(&p->mutex);
1017 }
1018}
1019
1398b2e3 1020void multifd_save_cleanup(void)
f986c3d2
JQ
1021{
1022 int i;
f986c3d2
JQ
1023
1024 if (!migrate_use_multifd()) {
1398b2e3 1025 return;
f986c3d2 1026 }
66770707
JQ
1027 multifd_send_terminate_threads(NULL);
1028 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1029 MultiFDSendParams *p = &multifd_send_state->params[i];
1030
66770707
JQ
1031 if (p->running) {
1032 qemu_thread_join(&p->thread);
1033 }
60df2d4a
JQ
1034 socket_send_channel_destroy(p->c);
1035 p->c = NULL;
f986c3d2
JQ
1036 qemu_mutex_destroy(&p->mutex);
1037 qemu_sem_destroy(&p->sem);
18cdcea3 1038 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1039 g_free(p->name);
1040 p->name = NULL;
34c55a94
JQ
1041 multifd_pages_clear(p->pages);
1042 p->pages = NULL;
2a26c979
JQ
1043 p->packet_len = 0;
1044 g_free(p->packet);
1045 p->packet = NULL;
f986c3d2 1046 }
b9ee2f7d 1047 qemu_sem_destroy(&multifd_send_state->channels_ready);
f986c3d2
JQ
1048 g_free(multifd_send_state->params);
1049 multifd_send_state->params = NULL;
34c55a94
JQ
1050 multifd_pages_clear(multifd_send_state->pages);
1051 multifd_send_state->pages = NULL;
f986c3d2
JQ
1052 g_free(multifd_send_state);
1053 multifd_send_state = NULL;
f986c3d2
JQ
1054}
1055
1b81c974 1056static void multifd_send_sync_main(RAMState *rs)
6df264ac
JQ
1057{
1058 int i;
1059
1060 if (!migrate_use_multifd()) {
1061 return;
1062 }
b9ee2f7d 1063 if (multifd_send_state->pages->used) {
1b81c974 1064 if (multifd_send_pages(rs) < 0) {
713f762a
IR
1065 error_report("%s: multifd_send_pages fail", __func__);
1066 return;
1067 }
b9ee2f7d 1068 }
6df264ac
JQ
1069 for (i = 0; i < migrate_multifd_channels(); i++) {
1070 MultiFDSendParams *p = &multifd_send_state->params[i];
1071
1072 trace_multifd_send_sync_main_signal(p->id);
1073
1074 qemu_mutex_lock(&p->mutex);
b9ee2f7d 1075
713f762a
IR
1076 if (p->quit) {
1077 error_report("%s: channel %d has already quit", __func__, i);
1078 qemu_mutex_unlock(&p->mutex);
1079 return;
1080 }
1081
b9ee2f7d 1082 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
1083 p->flags |= MULTIFD_FLAG_SYNC;
1084 p->pending_job++;
1b81c974 1085 qemu_file_update_transfer(rs->f, p->packet_len);
81507f6b
IR
1086 ram_counters.multifd_bytes += p->packet_len;
1087 ram_counters.transferred += p->packet_len;
6df264ac
JQ
1088 qemu_mutex_unlock(&p->mutex);
1089 qemu_sem_post(&p->sem);
1090 }
1091 for (i = 0; i < migrate_multifd_channels(); i++) {
1092 MultiFDSendParams *p = &multifd_send_state->params[i];
1093
1094 trace_multifd_send_sync_main_wait(p->id);
18cdcea3 1095 qemu_sem_wait(&p->sem_sync);
6df264ac
JQ
1096 }
1097 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1098}
1099
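/*
 * Editor's note on the sync handshake implemented above and in
 * multifd_recv_sync_main()/multifd_recv_thread() below:
 * - source: multifd_send_sync_main() tags every channel with
 *   MULTIFD_FLAG_SYNC and waits on each p->sem_sync, which the send
 *   thread posts once the SYNC packet has been written out;
 * - destination: each receive thread that sees the flag posts
 *   multifd_recv_state->sem_sync and blocks on its own p->sem_sync until
 *   multifd_recv_sync_main() has heard from every channel and releases it.
 */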
f986c3d2
JQ
1100static void *multifd_send_thread(void *opaque)
1101{
1102 MultiFDSendParams *p = opaque;
af8b7d2b 1103 Error *local_err = NULL;
a3ec6b7d
IR
1104 int ret = 0;
1105 uint32_t flags = 0;
af8b7d2b 1106
408ea6ae 1107 trace_multifd_send_thread_start(p->id);
74637e6f 1108 rcu_register_thread();
408ea6ae 1109
af8b7d2b 1110 if (multifd_send_initial_packet(p, &local_err) < 0) {
2f4aefd3 1111 ret = -1;
af8b7d2b
JQ
1112 goto out;
1113 }
408ea6ae
JQ
1114 /* initial packet */
1115 p->num_packets = 1;
f986c3d2
JQ
1116
1117 while (true) {
d82628e4 1118 qemu_sem_wait(&p->sem);
f986c3d2 1119 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1120
1121 if (p->pending_job) {
1122 uint32_t used = p->pages->used;
1123 uint64_t packet_num = p->packet_num;
a3ec6b7d 1124 flags = p->flags;
0beb5ed3 1125
2a34ee59 1126 p->next_packet_size = used * qemu_target_page_size();
0beb5ed3
JQ
1127 multifd_send_fill_packet(p);
1128 p->flags = 0;
1129 p->num_packets++;
1130 p->num_pages += used;
0beb5ed3
JQ
1131 qemu_mutex_unlock(&p->mutex);
1132
2a34ee59
JQ
1133 trace_multifd_send(p->id, packet_num, used, flags,
1134 p->next_packet_size);
0beb5ed3 1135
8b2db7f5
JQ
1136 ret = qio_channel_write_all(p->c, (void *)p->packet,
1137 p->packet_len, &local_err);
1138 if (ret != 0) {
1139 break;
1140 }
1141
ad24c7cb
JQ
1142 if (used) {
1143 ret = qio_channel_writev_all(p->c, p->pages->iov,
1144 used, &local_err);
1145 if (ret != 0) {
1146 break;
1147 }
8b2db7f5 1148 }
0beb5ed3
JQ
1149
1150 qemu_mutex_lock(&p->mutex);
1151 p->pending_job--;
1152 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1153
1154 if (flags & MULTIFD_FLAG_SYNC) {
18cdcea3 1155 qemu_sem_post(&p->sem_sync);
6df264ac 1156 }
b9ee2f7d 1157 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1158 } else if (p->quit) {
f986c3d2
JQ
1159 qemu_mutex_unlock(&p->mutex);
1160 break;
6df264ac
JQ
1161 } else {
1162 qemu_mutex_unlock(&p->mutex);
1163 /* sometimes there are spurious wakeups */
f986c3d2 1164 }
f986c3d2
JQ
1165 }
1166
af8b7d2b
JQ
1167out:
1168 if (local_err) {
7dd59d01 1169 trace_multifd_send_error(p->id);
af8b7d2b
JQ
1170 multifd_send_terminate_threads(local_err);
1171 }
1172
a3ec6b7d
IR
1173 /*
1174 * An error happened; we are about to exit, but we can't just leave:
1175 * wake up whoever is waiting on us.
1176 */
1177 if (ret != 0) {
2f4aefd3 1178 qemu_sem_post(&p->sem_sync);
a3ec6b7d
IR
1179 qemu_sem_post(&multifd_send_state->channels_ready);
1180 }
1181
66770707
JQ
1182 qemu_mutex_lock(&p->mutex);
1183 p->running = false;
1184 qemu_mutex_unlock(&p->mutex);
1185
74637e6f 1186 rcu_unregister_thread();
408ea6ae
JQ
1187 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1188
f986c3d2
JQ
1189 return NULL;
1190}
1191
60df2d4a
JQ
1192static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1193{
1194 MultiFDSendParams *p = opaque;
1195 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1196 Error *local_err = NULL;
1197
7dd59d01 1198 trace_multifd_new_send_channel_async(p->id);
60df2d4a 1199 if (qio_task_propagate_error(task, &local_err)) {
1398b2e3
FL
1200 migrate_set_error(migrate_get_current(), local_err);
1201 multifd_save_cleanup();
60df2d4a
JQ
1202 } else {
1203 p->c = QIO_CHANNEL(sioc);
1204 qio_channel_set_delay(p->c, false);
1205 p->running = true;
1206 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1207 QEMU_THREAD_JOINABLE);
60df2d4a
JQ
1208 }
1209}
1210
f986c3d2
JQ
1211int multifd_save_setup(void)
1212{
1213 int thread_count;
efd1a1d6 1214 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1215 uint8_t i;
1216
1217 if (!migrate_use_multifd()) {
1218 return 0;
1219 }
1220 thread_count = migrate_multifd_channels();
1221 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1222 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
34c55a94 1223 multifd_send_state->pages = multifd_pages_init(page_count);
b9ee2f7d 1224 qemu_sem_init(&multifd_send_state->channels_ready, 0);
34c55a94 1225
f986c3d2
JQ
1226 for (i = 0; i < thread_count; i++) {
1227 MultiFDSendParams *p = &multifd_send_state->params[i];
1228
1229 qemu_mutex_init(&p->mutex);
1230 qemu_sem_init(&p->sem, 0);
18cdcea3 1231 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1232 p->quit = false;
0beb5ed3 1233 p->pending_job = 0;
f986c3d2 1234 p->id = i;
34c55a94 1235 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1236 p->packet_len = sizeof(MultiFDPacket_t)
1237 + sizeof(ram_addr_t) * page_count;
1238 p->packet = g_malloc0(p->packet_len);
9985e1f4
WY
1239 p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1240 p->packet->version = cpu_to_be32(MULTIFD_VERSION);
f986c3d2 1241 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1242 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1243 }
1244 return 0;
1245}
1246
f986c3d2
JQ
1247struct {
1248 MultiFDRecvParams *params;
1249 /* number of created threads */
1250 int count;
6df264ac
JQ
1251 /* syncs main thread and channels */
1252 QemuSemaphore sem_sync;
1253 /* global number of generated multifd packets */
1254 uint64_t packet_num;
f986c3d2
JQ
1255} *multifd_recv_state;
1256
66770707 1257static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1258{
1259 int i;
1260
5558c91a
JQ
1261 trace_multifd_recv_terminate_threads(err != NULL);
1262
7a169d74
JQ
1263 if (err) {
1264 MigrationState *s = migrate_get_current();
1265 migrate_set_error(s, err);
1266 if (s->state == MIGRATION_STATUS_SETUP ||
1267 s->state == MIGRATION_STATUS_ACTIVE) {
1268 migrate_set_state(&s->state, s->state,
1269 MIGRATION_STATUS_FAILED);
1270 }
1271 }
1272
66770707 1273 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1274 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1275
1276 qemu_mutex_lock(&p->mutex);
3c3ca25d 1277 p->quit = true;
7a5cc33c
JQ
1278 /* We could arrive here for two reasons:
1279 - normal quit, i.e. everything went fine, just finished
1280 - error quit: We close the channels so the channel threads
1281 finish the qio_channel_read_all_eof() */
1282 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
f986c3d2
JQ
1283 qemu_mutex_unlock(&p->mutex);
1284 }
1285}
1286
1287int multifd_load_cleanup(Error **errp)
1288{
1289 int i;
1290 int ret = 0;
1291
1292 if (!migrate_use_multifd()) {
1293 return 0;
1294 }
66770707
JQ
1295 multifd_recv_terminate_threads(NULL);
1296 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1297 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1298
66770707 1299 if (p->running) {
3c3ca25d 1300 p->quit = true;
f193bc0c
IR
1301 /*
1302 * multifd_recv_thread may hang at the MULTIFD_FLAG_SYNC handling code;
1303 * waking it up here is harmless in the cleanup phase.
1304 */
1305 qemu_sem_post(&p->sem_sync);
66770707
JQ
1306 qemu_thread_join(&p->thread);
1307 }
60df2d4a
JQ
1308 object_unref(OBJECT(p->c));
1309 p->c = NULL;
f986c3d2 1310 qemu_mutex_destroy(&p->mutex);
6df264ac 1311 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1312 g_free(p->name);
1313 p->name = NULL;
34c55a94
JQ
1314 multifd_pages_clear(p->pages);
1315 p->pages = NULL;
2a26c979
JQ
1316 p->packet_len = 0;
1317 g_free(p->packet);
1318 p->packet = NULL;
f986c3d2 1319 }
6df264ac 1320 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1321 g_free(multifd_recv_state->params);
1322 multifd_recv_state->params = NULL;
1323 g_free(multifd_recv_state);
1324 multifd_recv_state = NULL;
1325
1326 return ret;
1327}
1328
6df264ac
JQ
1329static void multifd_recv_sync_main(void)
1330{
1331 int i;
1332
1333 if (!migrate_use_multifd()) {
1334 return;
1335 }
1336 for (i = 0; i < migrate_multifd_channels(); i++) {
1337 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1338
6df264ac
JQ
1339 trace_multifd_recv_sync_main_wait(p->id);
1340 qemu_sem_wait(&multifd_recv_state->sem_sync);
77568ea7
WY
1341 }
1342 for (i = 0; i < migrate_multifd_channels(); i++) {
1343 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1344
6df264ac
JQ
1345 qemu_mutex_lock(&p->mutex);
1346 if (multifd_recv_state->packet_num < p->packet_num) {
1347 multifd_recv_state->packet_num = p->packet_num;
1348 }
1349 qemu_mutex_unlock(&p->mutex);
6df264ac 1350 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1351 qemu_sem_post(&p->sem_sync);
1352 }
1353 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1354}
1355
f986c3d2
JQ
1356static void *multifd_recv_thread(void *opaque)
1357{
1358 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1359 Error *local_err = NULL;
1360 int ret;
f986c3d2 1361
408ea6ae 1362 trace_multifd_recv_thread_start(p->id);
74637e6f 1363 rcu_register_thread();
408ea6ae 1364
f986c3d2 1365 while (true) {
6df264ac
JQ
1366 uint32_t used;
1367 uint32_t flags;
0beb5ed3 1368
3c3ca25d
JQ
1369 if (p->quit) {
1370 break;
1371 }
1372
8b2db7f5
JQ
1373 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1374 p->packet_len, &local_err);
1375 if (ret == 0) { /* EOF */
1376 break;
1377 }
1378 if (ret == -1) { /* Error */
1379 break;
1380 }
2a26c979 1381
6df264ac
JQ
1382 qemu_mutex_lock(&p->mutex);
1383 ret = multifd_recv_unfill_packet(p, &local_err);
1384 if (ret) {
f986c3d2
JQ
1385 qemu_mutex_unlock(&p->mutex);
1386 break;
1387 }
6df264ac
JQ
1388
1389 used = p->pages->used;
1390 flags = p->flags;
2a34ee59
JQ
1391 trace_multifd_recv(p->id, p->packet_num, used, flags,
1392 p->next_packet_size);
6df264ac
JQ
1393 p->num_packets++;
1394 p->num_pages += used;
f986c3d2 1395 qemu_mutex_unlock(&p->mutex);
6df264ac 1396
ad24c7cb
JQ
1397 if (used) {
1398 ret = qio_channel_readv_all(p->c, p->pages->iov,
1399 used, &local_err);
1400 if (ret != 0) {
1401 break;
1402 }
8b2db7f5
JQ
1403 }
1404
6df264ac
JQ
1405 if (flags & MULTIFD_FLAG_SYNC) {
1406 qemu_sem_post(&multifd_recv_state->sem_sync);
1407 qemu_sem_wait(&p->sem_sync);
1408 }
f986c3d2
JQ
1409 }
1410
d82628e4
JQ
1411 if (local_err) {
1412 multifd_recv_terminate_threads(local_err);
1413 }
66770707
JQ
1414 qemu_mutex_lock(&p->mutex);
1415 p->running = false;
1416 qemu_mutex_unlock(&p->mutex);
1417
74637e6f 1418 rcu_unregister_thread();
408ea6ae
JQ
1419 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1420
f986c3d2
JQ
1421 return NULL;
1422}
1423
1424int multifd_load_setup(void)
1425{
1426 int thread_count;
efd1a1d6 1427 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1428 uint8_t i;
1429
1430 if (!migrate_use_multifd()) {
1431 return 0;
1432 }
1433 thread_count = migrate_multifd_channels();
1434 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1435 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1436 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1437 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1438
f986c3d2
JQ
1439 for (i = 0; i < thread_count; i++) {
1440 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1441
1442 qemu_mutex_init(&p->mutex);
6df264ac 1443 qemu_sem_init(&p->sem_sync, 0);
3c3ca25d 1444 p->quit = false;
f986c3d2 1445 p->id = i;
34c55a94 1446 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1447 p->packet_len = sizeof(MultiFDPacket_t)
1448 + sizeof(ram_addr_t) * page_count;
1449 p->packet = g_malloc0(p->packet_len);
f986c3d2 1450 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1451 }
1452 return 0;
1453}
1454
62c1e0ca
JQ
1455bool multifd_recv_all_channels_created(void)
1456{
1457 int thread_count = migrate_multifd_channels();
1458
1459 if (!migrate_use_multifd()) {
1460 return true;
1461 }
1462
1463 return thread_count == atomic_read(&multifd_recv_state->count);
1464}
1465
49ed0d24
FL
1466/*
1467 * Try to receive all multifd channels to get ready for the migration.
1468 * - Return true and do not set @errp when correctly receiving all channels;
1469 * - Return false and do not set @errp when correctly receiving the current one;
1470 * - Return false and set @errp when failing to receive the current channel.
1471 */
1472bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
71bb07db 1473{
60df2d4a 1474 MultiFDRecvParams *p;
af8b7d2b
JQ
1475 Error *local_err = NULL;
1476 int id;
60df2d4a 1477
af8b7d2b
JQ
1478 id = multifd_recv_initial_packet(ioc, &local_err);
1479 if (id < 0) {
1480 multifd_recv_terminate_threads(local_err);
49ed0d24
FL
1481 error_propagate_prepend(errp, local_err,
1482 "failed to receive packet"
1483 " via multifd channel %d: ",
1484 atomic_read(&multifd_recv_state->count));
81e62053 1485 return false;
af8b7d2b 1486 }
7dd59d01 1487 trace_multifd_recv_new_channel(id);
af8b7d2b
JQ
1488
1489 p = &multifd_recv_state->params[id];
1490 if (p->c != NULL) {
1491 error_setg(&local_err, "multifd: received id '%d' already setup'",
1492 id);
1493 multifd_recv_terminate_threads(local_err);
49ed0d24 1494 error_propagate(errp, local_err);
81e62053 1495 return false;
af8b7d2b 1496 }
60df2d4a
JQ
1497 p->c = ioc;
1498 object_ref(OBJECT(ioc));
408ea6ae
JQ
1499 /* initial packet */
1500 p->num_packets = 1;
60df2d4a
JQ
1501
1502 p->running = true;
1503 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1504 QEMU_THREAD_JOINABLE);
1505 atomic_inc(&multifd_recv_state->count);
49ed0d24
FL
1506 return atomic_read(&multifd_recv_state->count) ==
1507 migrate_multifd_channels();
71bb07db
JQ
1508}
1509
56e93d26 1510/**
3d0684b2 1511 * save_page_header: write page header to wire
56e93d26
JQ
1512 *
1513 * If this is the 1st block, it also writes the block identification
1514 *
3d0684b2 1515 * Returns the number of bytes written
56e93d26
JQ
1516 *
1517 * @f: QEMUFile where to send the data
1518 * @block: block that contains the page we want to send
1519 * @offset: offset inside the block for the page
1520 * in the lower bits, it contains flags
1521 */
2bf3aa85
JQ
1522static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1523 ram_addr_t offset)
56e93d26 1524{
9f5f380b 1525 size_t size, len;
56e93d26 1526
24795694
JQ
1527 if (block == rs->last_sent_block) {
1528 offset |= RAM_SAVE_FLAG_CONTINUE;
1529 }
2bf3aa85 1530 qemu_put_be64(f, offset);
56e93d26
JQ
1531 size = 8;
1532
1533 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1534 len = strlen(block->idstr);
2bf3aa85
JQ
1535 qemu_put_byte(f, len);
1536 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1537 size += 1 + len;
24795694 1538 rs->last_sent_block = block;
56e93d26
JQ
1539 }
1540 return size;
1541}
1542
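/*
 * Editor's worked example of the resulting stream, assuming 4 KiB target
 * pages: the first page sent from block "pc.ram" as a full page produces
 *   be64(offset | RAM_SAVE_FLAG_PAGE), byte strlen("pc.ram"), "pc.ram"
 * followed by 4096 bytes of page data, while later pages of the same
 * block carry RAM_SAVE_FLAG_CONTINUE and omit the block name.
 */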
3d0684b2
JQ
1543/**
1544 * mig_throttle_guest_down: throttle down the guest
1545 *
1546 * Reduce amount of guest cpu execution to hopefully slow down memory
1547 * writes. If guest dirty memory rate is reduced below the rate at
1548 * which we can transfer pages to the destination then we should be
1549 * able to complete migration. Some workloads dirty memory way too
1550 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
1551 */
1552static void mig_throttle_guest_down(void)
1553{
1554 MigrationState *s = migrate_get_current();
2594f56d
DB
1555 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1556 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
4cbc9c7f 1557 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1558
1559 /* We have not started throttling yet. Let's start it. */
1560 if (!cpu_throttle_active()) {
1561 cpu_throttle_set(pct_initial);
1562 } else {
1563 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1564 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1565 pct_max));
070afca2
JH
1566 }
1567}
1568
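/*
 * Editor's worked example, assuming the default migration parameters
 * (cpu_throttle_initial=20, cpu_throttle_increment=10,
 * max_cpu_throttle=99): successive calls set the throttle to 20%, then
 * 30%, 40%, ... and finally clamp at 99%.
 */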
3d0684b2
JQ
1569/**
1570 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1571 *
6f37bb8b 1572 * @rs: current RAM state
3d0684b2
JQ
1573 * @current_addr: address for the zero page
1574 *
1575 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1576 * The important thing is that a stale (not-yet-0'd) page be replaced
1577 * by the new data.
1578 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1579 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1580 */
6f37bb8b 1581static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1582{
6f37bb8b 1583 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1584 return;
1585 }
1586
1587 /* We don't care if this fails to allocate a new cache page
1588 * as long as it updated an old one */
c00e0928 1589 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1590 ram_counters.dirty_sync_count);
56e93d26
JQ
1591}
1592
1593#define ENCODING_FLAG_XBZRLE 0x1
1594
1595/**
1596 * save_xbzrle_page: compress and send current page
1597 *
1598 * Returns: 1 means that we wrote the page
1599 * 0 means that page is identical to the one already sent
1600 * -1 means that xbzrle would be longer than normal
1601 *
5a987738 1602 * @rs: current RAM state
3d0684b2
JQ
1603 * @current_data: pointer to the address of the page contents
1604 * @current_addr: addr of the page
56e93d26
JQ
1605 * @block: block that contains the page we want to send
1606 * @offset: offset inside the block for the page
1607 * @last_stage: if we are at the completion stage
56e93d26 1608 */
204b88b8 1609static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1610 ram_addr_t current_addr, RAMBlock *block,
072c2511 1611 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1612{
1613 int encoded_len = 0, bytes_xbzrle;
1614 uint8_t *prev_cached_page;
1615
9360447d
JQ
1616 if (!cache_is_cached(XBZRLE.cache, current_addr,
1617 ram_counters.dirty_sync_count)) {
1618 xbzrle_counters.cache_miss++;
56e93d26
JQ
1619 if (!last_stage) {
1620 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1621 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1622 return -1;
1623 } else {
1624 /* update *current_data when the page has been
1625 inserted into cache */
1626 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1627 }
1628 }
1629 return -1;
1630 }
1631
1632 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1633
1634 /* save current buffer into memory */
1635 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1636
1637 /* XBZRLE encoding (if there is no overflow) */
1638 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1639 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1640 TARGET_PAGE_SIZE);
ca353803
WY
1641
1642 /*
1643 * Update the cache contents, so that it corresponds to the data
1644 * sent, in all cases except where we skip the page.
1645 */
1646 if (!last_stage && encoded_len != 0) {
1647 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1648 /*
1649 * In the case where we couldn't compress, ensure that the caller
1650 * sends the data from the cache, since the guest might have
1651 * changed the RAM since we copied it.
1652 */
1653 *current_data = prev_cached_page;
1654 }
1655
56e93d26 1656 if (encoded_len == 0) {
55c4446b 1657 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1658 return 0;
1659 } else if (encoded_len == -1) {
55c4446b 1660 trace_save_xbzrle_page_overflow();
9360447d 1661 xbzrle_counters.overflow++;
56e93d26
JQ
1662 return -1;
1663 }
1664
56e93d26 1665 /* Send XBZRLE based compressed page */
2bf3aa85 1666 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1667 offset | RAM_SAVE_FLAG_XBZRLE);
1668 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1669 qemu_put_be16(rs->f, encoded_len);
1670 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1671 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1672 xbzrle_counters.pages++;
1673 xbzrle_counters.bytes += bytes_xbzrle;
1674 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1675
1676 return 1;
1677}
1678
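/*
 * Editor's sketch (simplified) of the matching decode path; the real one
 * is load_xbzrle(), later in this file: read the flag byte, the
 * big-endian length and the encoded buffer, then patch the current page.
 */
#if 0
static int example_load_xbzrle(QEMUFile *f, void *host)
{
    int xh_flags = qemu_get_byte(f);
    unsigned int xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE || xh_len > TARGET_PAGE_SIZE) {
        return -1;
    }
    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);
    return xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len,
                                host, TARGET_PAGE_SIZE) < 0 ? -1 : 0;
}
#endif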
3d0684b2
JQ
1679/**
1680 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1681 *
a5f7b1a6 1682 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 1683 *
6f37bb8b 1684 * @rs: current RAM state
3d0684b2 1685 * @rb: RAMBlock where to search for dirty pages
a935e30f 1686 * @start: page where we start the search
f3f491fc 1687 */
56e93d26 1688static inline
a935e30f 1689unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1690 unsigned long start)
56e93d26 1691{
6b6712ef
JQ
1692 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1693 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1694 unsigned long next;
1695
fbd162e6 1696 if (ramblock_is_ignored(rb)) {
b895de50
CLG
1697 return size;
1698 }
1699
6eeb63f7
WW
1700 /*
1701 * When the free page optimization is enabled, we need to check the bitmap
1702 * to send the non-free pages rather than all the pages in the bulk stage.
1703 */
1704 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 1705 next = start + 1;
56e93d26 1706 } else {
6b6712ef 1707 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1708 }
1709
6b6712ef 1710 return next;
56e93d26
JQ
1711}
1712
06b10688 1713static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1714 RAMBlock *rb,
1715 unsigned long page)
a82d593b
DDAG
1716{
1717 bool ret;
a82d593b 1718
386a907b 1719 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
1720
1721 /*
1722 * Clear dirty bitmap if needed. This _must_ be called before we
1723 * send any of the pages in the chunk, because we need to make sure
1724 * we can capture further page content changes when we sync the dirty
1725 * log next time. So as long as we are going to send any of the
1726 * pages in the chunk we clear the remote dirty bitmap for all of them.
1727 * Clearing it earlier won't be a problem, but clearing it too late will.
1728 */
1729 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1730 uint8_t shift = rb->clear_bmap_shift;
1731 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1732 hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1733
1734 /*
1735 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It also
1736 * makes things easier, since the start address of the small
1737 * chunk will then always be aligned to 64 pages, so the bitmap
1738 * will always be aligned to unsigned long. We should even be
1739 * able to remove this restriction but I'm simply
1740 * keeping it.
1741 */
1742 assert(shift >= 6);
1743 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1744 memory_region_clear_dirty_bitmap(rb->mr, start, size);
1745 }
1746
6b6712ef 1747 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1748
1749 if (ret) {
0d8ec885 1750 rs->migration_dirty_pages--;
a82d593b 1751 }
386a907b
WW
1752 qemu_mutex_unlock(&rs->bitmap_mutex);
1753
a82d593b
DDAG
1754 return ret;
1755}
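/*
 * Illustrative sketch, not QEMU code: the clear_bmap handling above clears
 * the dirty log in chunks of 2^clear_bmap_shift pages.  The chunk that
 * covers a given page starts at the page's address aligned down to the
 * chunk size, which is what "(page << TARGET_PAGE_BITS) & (-size)"
 * computes.  TOY_TARGET_PAGE_BITS = 12 (4 KiB pages) is an assumption for
 * the example.
 */
#include <stdint.h>

#define TOY_TARGET_PAGE_BITS 12

static void toy_clear_chunk(unsigned long page, unsigned int shift,
                            uint64_t *start, uint64_t *size)
{
    *size = 1ULL << (TOY_TARGET_PAGE_BITS + shift);
    /* aligning down with &(-size) works because size is a power of two */
    *start = ((uint64_t)page << TOY_TARGET_PAGE_BITS) & ~(*size - 1);
}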
1756
267691b6 1757/* Called with RCU critical section */
7a3e9571 1758static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1759{
0d8ec885 1760 rs->migration_dirty_pages +=
5d0980a4 1761 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
0d8ec885 1762 &rs->num_dirty_pages_period);
56e93d26
JQ
1763}
1764
3d0684b2
JQ
1765/**
1766 * ram_pagesize_summary: calculate all the pagesizes of a VM
1767 *
1768 * Returns a summary bitmap of the page sizes of all RAMBlocks
1769 *
1770 * For VMs with just normal pages this is equivalent to the host page
1771 * size. If it's got some huge pages then it's the OR of all the
1772 * different page sizes.
e8ca1db2
DDAG
1773 */
1774uint64_t ram_pagesize_summary(void)
1775{
1776 RAMBlock *block;
1777 uint64_t summary = 0;
1778
fbd162e6 1779 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1780 summary |= block->page_size;
1781 }
1782
1783 return summary;
1784}
1785
aecbfe9c
XG
1786uint64_t ram_get_total_transferred_pages(void)
1787{
1788 return ram_counters.normal + ram_counters.duplicate +
1789 compression_counters.pages + xbzrle_counters.pages;
1790}
1791
b734035b
XG
1792static void migration_update_rates(RAMState *rs, int64_t end_time)
1793{
be8b02ed 1794 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1795 double compressed_size;
b734035b
XG
1796
1797 /* calculate period counters */
1798 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1799 / (end_time - rs->time_last_bitmap_sync);
1800
be8b02ed 1801 if (!page_count) {
b734035b
XG
1802 return;
1803 }
1804
1805 if (migrate_use_xbzrle()) {
1806 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1807 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1808 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1809 }
76e03000
XG
1810
1811 if (migrate_use_compression()) {
1812 compression_counters.busy_rate = (double)(compression_counters.busy -
1813 rs->compress_thread_busy_prev) / page_count;
1814 rs->compress_thread_busy_prev = compression_counters.busy;
1815
1816 compressed_size = compression_counters.compressed_size -
1817 rs->compressed_size_prev;
1818 if (compressed_size) {
1819 double uncompressed_size = (compression_counters.pages -
1820 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1821
1822 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1823 compression_counters.compression_rate =
1824 uncompressed_size / compressed_size;
1825
1826 rs->compress_pages_prev = compression_counters.pages;
1827 rs->compressed_size_prev = compression_counters.compressed_size;
1828 }
1829 }
b734035b
XG
1830}
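/*
 * Illustrative sketch, not QEMU code: the period statistics computed in
 * migration_update_rates() above reduce to a few simple ratios.  The same
 * arithmetic on plain counters, with invented names:
 */
#include <stdint.h>

static void toy_update_rates(uint64_t dirty_pages_period, int64_t period_ms,
                             uint64_t cache_miss_delta, uint64_t page_count,
                             double *dirty_pages_per_sec,
                             double *cache_miss_rate)
{
    /* pages dirtied per second over the period between bitmap syncs */
    *dirty_pages_per_sec = period_ms > 0 ?
        dirty_pages_period * 1000.0 / period_ms : 0.0;
    /* fraction of the transferred pages that missed the XBZRLE cache */
    *cache_miss_rate = page_count ?
        (double)cache_miss_delta / page_count : 0.0;
}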
1831
8d820d6f 1832static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1833{
1834 RAMBlock *block;
56e93d26 1835 int64_t end_time;
c4bdf0cf 1836 uint64_t bytes_xfer_now;
56e93d26 1837
9360447d 1838 ram_counters.dirty_sync_count++;
56e93d26 1839
f664da80
JQ
1840 if (!rs->time_last_bitmap_sync) {
1841 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1842 }
1843
1844 trace_migration_bitmap_sync_start();
9c1f8f44 1845 memory_global_dirty_log_sync();
56e93d26 1846
108cfae0 1847 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1848 WITH_RCU_READ_LOCK_GUARD() {
1849 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1850 ramblock_sync_dirty_bitmap(rs, block);
1851 }
1852 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1853 }
108cfae0 1854 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1855
9458a9a1 1856 memory_global_after_dirty_log_sync();
a66cd90c 1857 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1858
56e93d26
JQ
1859 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1860
1861 /* more than 1 second = 1000 milliseconds */
f664da80 1862 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1863 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1864
9ac78b61
PL
1865 /* During block migration the auto-converge logic incorrectly detects
1866 * that ram migration makes no progress. Avoid this by disabling the
1867 * throttling logic during the bulk phase of block migration. */
1868 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1869 /* The following detection logic can be refined later. For now:
1870 Check to see if the dirtied bytes exceed 50% of the approx.
1871 amount of bytes that just got transferred since the last time we
070afca2
JH
1872 were in this routine. If that happens twice, start or increase
1873 throttling */
070afca2 1874
d693c6f1 1875 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1876 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1877 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1878 trace_migration_throttle();
8d820d6f 1879 rs->dirty_rate_high_cnt = 0;
070afca2 1880 mig_throttle_guest_down();
d693c6f1 1881 }
56e93d26 1882 }
070afca2 1883
b734035b
XG
1884 migration_update_rates(rs, end_time);
1885
be8b02ed 1886 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1887
1888 /* reset period counters */
f664da80 1889 rs->time_last_bitmap_sync = end_time;
a66cd90c 1890 rs->num_dirty_pages_period = 0;
d2a4d85a 1891 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1892 }
4addcd4f 1893 if (migrate_use_events()) {
3ab72385 1894 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1895 }
56e93d26
JQ
1896}
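/*
 * Illustrative sketch, not QEMU code: the auto-converge trigger inside
 * migration_bitmap_sync() compares how much the guest dirtied during the
 * last period with how much the migration actually transferred, and asks
 * for throttling once that happens twice in a row.  Condensed, with
 * invented names:
 */
#include <stdbool.h>
#include <stdint.h>

static bool toy_should_throttle(uint64_t dirty_pages_period,
                                uint64_t page_size,
                                uint64_t bytes_xfer_period,
                                int *dirty_rate_high_cnt)
{
    if (dirty_pages_period * page_size > bytes_xfer_period / 2) {
        if (++(*dirty_rate_high_cnt) >= 2) {
            *dirty_rate_high_cnt = 0;
            return true;    /* caller would then slow the guest down */
        }
    }
    return false;
}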
1897
bd227060
WW
1898static void migration_bitmap_sync_precopy(RAMState *rs)
1899{
1900 Error *local_err = NULL;
1901
1902 /*
1903 * The current notifier usage is just an optimization to migration, so we
1904 * don't stop the normal migration process in the error case.
1905 */
1906 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1907 error_report_err(local_err);
1908 }
1909
1910 migration_bitmap_sync(rs);
1911
1912 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1913 error_report_err(local_err);
1914 }
1915}
1916
6c97ec5f
XG
1917/**
1918 * save_zero_page_to_file: send the zero page to the file
1919 *
1920 * Returns the size of the data written to the file, or 0 if the page is
1921 * not a zero page
1922 *
1923 * @rs: current RAM state
1924 * @file: the file where the data is saved
1925 * @block: block that contains the page we want to send
1926 * @offset: offset inside the block for the page
1927 */
1928static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1929 RAMBlock *block, ram_addr_t offset)
1930{
1931 uint8_t *p = block->host + offset;
1932 int len = 0;
1933
1934 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1935 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1936 qemu_put_byte(file, 0);
1937 len += 1;
1938 }
1939 return len;
1940}
1941
56e93d26 1942/**
3d0684b2 1943 * save_zero_page: send the zero page to the stream
56e93d26 1944 *
3d0684b2 1945 * Returns the number of pages written.
56e93d26 1946 *
f7ccd61b 1947 * @rs: current RAM state
56e93d26
JQ
1948 * @block: block that contains the page we want to send
1949 * @offset: offset inside the block for the page
56e93d26 1950 */
7faccdc3 1951static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1952{
6c97ec5f 1953 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1954
6c97ec5f 1955 if (len) {
9360447d 1956 ram_counters.duplicate++;
6c97ec5f
XG
1957 ram_counters.transferred += len;
1958 return 1;
56e93d26 1959 }
6c97ec5f 1960 return -1;
56e93d26
JQ
1961}
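/*
 * Illustrative sketch, not QEMU code: save_zero_page_to_file() only sends
 * a header plus one byte when the whole page reads back as zeroes.  The
 * detection itself is just a scan of the page; is_zero_range() /
 * buffer_is_zero() are optimised versions of this loop.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool toy_is_zero_page(const uint8_t *p, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}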
1962
5727309d 1963static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1964{
5727309d 1965 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1966 return;
1967 }
1968
aaa2064c 1969 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1970}
1971
059ff0fb
XG
1972/*
1973 * @pages: the number of pages written by the control path,
1974 * < 0 - error
1975 * > 0 - number of pages written
1976 *
1977 * Return true if the page has been saved, otherwise false is returned.
1978 */
1979static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1980 int *pages)
1981{
1982 uint64_t bytes_xmit = 0;
1983 int ret;
1984
1985 *pages = -1;
1986 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1987 &bytes_xmit);
1988 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1989 return false;
1990 }
1991
1992 if (bytes_xmit) {
1993 ram_counters.transferred += bytes_xmit;
1994 *pages = 1;
1995 }
1996
1997 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1998 return true;
1999 }
2000
2001 if (bytes_xmit > 0) {
2002 ram_counters.normal++;
2003 } else if (bytes_xmit == 0) {
2004 ram_counters.duplicate++;
2005 }
2006
2007 return true;
2008}
2009
65dacaa0
XG
2010/*
2011 * directly send the page to the stream
2012 *
2013 * Returns the number of pages written.
2014 *
2015 * @rs: current RAM state
2016 * @block: block that contains the page we want to send
2017 * @offset: offset inside the block for the page
2018 * @buf: the page to be sent
2019 * @async: send the page asynchronously
2020 */
2021static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2022 uint8_t *buf, bool async)
2023{
2024 ram_counters.transferred += save_page_header(rs, rs->f, block,
2025 offset | RAM_SAVE_FLAG_PAGE);
2026 if (async) {
2027 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2028 migrate_release_ram() &
2029 migration_in_postcopy());
2030 } else {
2031 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2032 }
2033 ram_counters.transferred += TARGET_PAGE_SIZE;
2034 ram_counters.normal++;
2035 return 1;
2036}
2037
56e93d26 2038/**
3d0684b2 2039 * ram_save_page: send the given page to the stream
56e93d26 2040 *
3d0684b2 2041 * Returns the number of pages written.
3fd3c4b3
DDAG
2042 * < 0 - error
2043 * >=0 - Number of pages written - this might legally be 0
2044 * if xbzrle noticed the page was the same.
56e93d26 2045 *
6f37bb8b 2046 * @rs: current RAM state
56e93d26
JQ
2047 * @block: block that contains the page we want to send
2048 * @offset: offset inside the block for the page
2049 * @last_stage: if we are at the completion stage
56e93d26 2050 */
a0a8aa14 2051static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
2052{
2053 int pages = -1;
56e93d26 2054 uint8_t *p;
56e93d26 2055 bool send_async = true;
a08f6890 2056 RAMBlock *block = pss->block;
a935e30f 2057 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 2058 ram_addr_t current_addr = block->offset + offset;
56e93d26 2059
2f68e399 2060 p = block->host + offset;
1db9d8e5 2061 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 2062
56e93d26 2063 XBZRLE_cache_lock();
d7400a34
XG
2064 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2065 migrate_use_xbzrle()) {
059ff0fb
XG
2066 pages = save_xbzrle_page(rs, &p, current_addr, block,
2067 offset, last_stage);
2068 if (!last_stage) {
2069 /* Can't send this cached data async, since the cache page
2070 * might get updated before it gets to the wire
56e93d26 2071 */
059ff0fb 2072 send_async = false;
56e93d26
JQ
2073 }
2074 }
2075
2076 /* XBZRLE overflow or normal page */
2077 if (pages == -1) {
65dacaa0 2078 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
2079 }
2080
2081 XBZRLE_cache_unlock();
2082
2083 return pages;
2084}
2085
b9ee2f7d
JQ
2086static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2087 ram_addr_t offset)
2088{
1b81c974 2089 if (multifd_queue_page(rs, block, offset) < 0) {
713f762a
IR
2090 return -1;
2091 }
b9ee2f7d
JQ
2092 ram_counters.normal++;
2093
2094 return 1;
2095}
2096
5e5fdcff 2097static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 2098 ram_addr_t offset, uint8_t *source_buf)
56e93d26 2099{
53518d94 2100 RAMState *rs = ram_state;
a7a9a88f 2101 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 2102 bool zero_page = false;
6ef3771c 2103 int ret;
56e93d26 2104
5e5fdcff
XG
2105 if (save_zero_page_to_file(rs, f, block, offset)) {
2106 zero_page = true;
2107 goto exit;
2108 }
2109
6ef3771c 2110 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
2111
2112 /*
2113 * copy it to an internal buffer to avoid it being modified by the VM
2114 * so that we can catch any error during compression and
2115 * decompression
2116 */
2117 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
2118 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2119 if (ret < 0) {
2120 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 2121 error_report("compressed data failed!");
5e5fdcff 2122 return false;
b3be2896 2123 }
56e93d26 2124
5e5fdcff 2125exit:
6ef3771c 2126 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
2127 return zero_page;
2128}
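/*
 * Illustrative sketch, not QEMU code: do_compress_ram_page() snapshots the
 * page into a private buffer and then compresses that copy, so a guest
 * write racing with the compression cannot corrupt the stream.  A
 * stand-alone equivalent using zlib's one-shot API (the real code reuses a
 * per-thread z_stream instead); the 4 KiB page size is an assumption:
 */
#include <string.h>
#include <zlib.h>

static long toy_compress_page(const unsigned char *page, size_t page_size,
                              unsigned char *dst, size_t dst_size, int level)
{
    unsigned char copy[4096];
    uLongf out_len = dst_size;

    if (page_size > sizeof(copy)) {
        return -1;
    }
    /* snapshot first: the guest may keep writing to the original page */
    memcpy(copy, page, page_size);
    if (compress2(dst, &out_len, copy, page_size, level) != Z_OK) {
        return -1;
    }
    return (long)out_len;
}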
2129
2130static void
2131update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2132{
76e03000
XG
2133 ram_counters.transferred += bytes_xmit;
2134
5e5fdcff
XG
2135 if (param->zero_page) {
2136 ram_counters.duplicate++;
76e03000 2137 return;
5e5fdcff 2138 }
76e03000
XG
2139
2140 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2141 compression_counters.compressed_size += bytes_xmit - 8;
2142 compression_counters.pages++;
56e93d26
JQ
2143}
2144
32b05495
XG
2145static bool save_page_use_compression(RAMState *rs);
2146
ce25d337 2147static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
2148{
2149 int idx, len, thread_count;
2150
32b05495 2151 if (!save_page_use_compression(rs)) {
56e93d26
JQ
2152 return;
2153 }
2154 thread_count = migrate_compress_threads();
a7a9a88f 2155
0d9f9a5c 2156 qemu_mutex_lock(&comp_done_lock);
56e93d26 2157 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 2158 while (!comp_param[idx].done) {
0d9f9a5c 2159 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 2160 }
a7a9a88f 2161 }
0d9f9a5c 2162 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
2163
2164 for (idx = 0; idx < thread_count; idx++) {
2165 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 2166 if (!comp_param[idx].quit) {
ce25d337 2167 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
2168 /*
2169 * it's safe to fetch zero_page without holding comp_done_lock
2170 * as there is no further request submitted to the thread,
2171 * i.e., the thread should be waiting for a request at this point.
2172 */
2173 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 2174 }
a7a9a88f 2175 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
2176 }
2177}
2178
2179static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2180 ram_addr_t offset)
2181{
2182 param->block = block;
2183 param->offset = offset;
2184}
2185
ce25d337
JQ
2186static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2187 ram_addr_t offset)
56e93d26
JQ
2188{
2189 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 2190 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
2191
2192 thread_count = migrate_compress_threads();
0d9f9a5c 2193 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
2194retry:
2195 for (idx = 0; idx < thread_count; idx++) {
2196 if (comp_param[idx].done) {
2197 comp_param[idx].done = false;
2198 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2199 qemu_mutex_lock(&comp_param[idx].mutex);
2200 set_compress_params(&comp_param[idx], block, offset);
2201 qemu_cond_signal(&comp_param[idx].cond);
2202 qemu_mutex_unlock(&comp_param[idx].mutex);
2203 pages = 1;
5e5fdcff 2204 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2205 break;
56e93d26
JQ
2206 }
2207 }
1d58872a
XG
2208
2209 /*
2210 * wait for the free thread if the user specifies 'compress-wait-thread',
2211 * otherwise we will post the page out in the main thread as a normal page.
2212 */
2213 if (pages < 0 && wait) {
2214 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2215 goto retry;
2216 }
0d9f9a5c 2217 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2218
2219 return pages;
2220}
2221
3d0684b2
JQ
2222/**
2223 * find_dirty_block: find the next dirty page and update any state
2224 * associated with the search process.
b9e60928 2225 *
a5f7b1a6 2226 * Returns true if a page is found
b9e60928 2227 *
6f37bb8b 2228 * @rs: current RAM state
3d0684b2
JQ
2229 * @pss: data about the state of the current dirty page scan
2230 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2231 */
f20e2865 2232static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2233{
f20e2865 2234 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2235 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2236 pss->page >= rs->last_page) {
b9e60928
DDAG
2237 /*
2238 * We've been once around the RAM and haven't found anything.
2239 * Give up.
2240 */
2241 *again = false;
2242 return false;
2243 }
a935e30f 2244 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2245 /* Didn't find anything in this RAM Block */
a935e30f 2246 pss->page = 0;
b9e60928
DDAG
2247 pss->block = QLIST_NEXT_RCU(pss->block, next);
2248 if (!pss->block) {
48df9d80
XG
2249 /*
2250 * If memory migration starts over, we will meet a dirtied page
2251 * which may still exist in a compression thread's ring, so we
2252 * should flush the compressed data to make sure the new page
2253 * is not overwritten by the old one on the destination.
2254 *
2255 * Also, if xbzrle is on, stop using the data compression at this
2256 * point. In theory, xbzrle can do better than compression.
2257 */
2258 flush_compressed_data(rs);
2259
b9e60928
DDAG
2260 /* Hit the end of the list */
2261 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2262 /* Flag that we've looped */
2263 pss->complete_round = true;
6f37bb8b 2264 rs->ram_bulk_stage = false;
b9e60928
DDAG
2265 }
2266 /* Didn't find anything this time, but try again on the new block */
2267 *again = true;
2268 return false;
2269 } else {
2270 /* Can go around again, but... */
2271 *again = true;
2272 /* We've found something so probably don't need to */
2273 return true;
2274 }
2275}
2276
3d0684b2
JQ
2277/**
2278 * unqueue_page: gets a page of the queue
2279 *
a82d593b 2280 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2281 *
3d0684b2
JQ
2282 * Returns the block of the page (or NULL if none available)
2283 *
ec481c6c 2284 * @rs: current RAM state
3d0684b2 2285 * @offset: used to return the offset within the RAMBlock
a82d593b 2286 */
f20e2865 2287static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2288{
2289 RAMBlock *block = NULL;
2290
ae526e32
XG
2291 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2292 return NULL;
2293 }
2294
ec481c6c
JQ
2295 qemu_mutex_lock(&rs->src_page_req_mutex);
2296 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2297 struct RAMSrcPageRequest *entry =
2298 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2299 block = entry->rb;
2300 *offset = entry->offset;
a82d593b
DDAG
2301
2302 if (entry->len > TARGET_PAGE_SIZE) {
2303 entry->len -= TARGET_PAGE_SIZE;
2304 entry->offset += TARGET_PAGE_SIZE;
2305 } else {
2306 memory_region_unref(block->mr);
ec481c6c 2307 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2308 g_free(entry);
e03a34f8 2309 migration_consume_urgent_request();
a82d593b
DDAG
2310 }
2311 }
ec481c6c 2312 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2313
2314 return block;
2315}
2316
3d0684b2 2317/**
ff1543af 2318 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2319 *
2320 * Skips pages that are already sent (!dirty)
a82d593b 2321 *
a5f7b1a6 2322 * Returns true if a queued page is found
a82d593b 2323 *
6f37bb8b 2324 * @rs: current RAM state
3d0684b2 2325 * @pss: data about the state of the current dirty page scan
a82d593b 2326 */
f20e2865 2327static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2328{
2329 RAMBlock *block;
2330 ram_addr_t offset;
2331 bool dirty;
2332
2333 do {
f20e2865 2334 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2335 /*
2336 * We're sending this page, and since it's postcopy nothing else
2337 * will dirty it, and we must make sure it doesn't get sent again
2338 * even if this queue request was received after the background
2339 * search already sent it.
2340 */
2341 if (block) {
f20e2865
JQ
2342 unsigned long page;
2343
6b6712ef
JQ
2344 page = offset >> TARGET_PAGE_BITS;
2345 dirty = test_bit(page, block->bmap);
a82d593b 2346 if (!dirty) {
06b10688 2347 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 2348 page);
a82d593b 2349 } else {
f20e2865 2350 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2351 }
2352 }
2353
2354 } while (block && !dirty);
2355
2356 if (block) {
2357 /*
2358 * As soon as we start servicing pages out of order, we have
2359 * to kill the bulk stage, since the bulk stage assumes
2360 * (in migration_bitmap_find_dirty) that every page is
2361 * dirty, and that's no longer true.
2362 */
6f37bb8b 2363 rs->ram_bulk_stage = false;
a82d593b
DDAG
2364
2365 /*
2366 * We want the background search to continue from the queued page
2367 * since the guest is likely to want other pages near to the page
2368 * it just requested.
2369 */
2370 pss->block = block;
a935e30f 2371 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2372
2373 /*
2374 * This unqueued page would break the "one round" check, even if
2375 * it is really rare.
2376 */
2377 pss->complete_round = false;
a82d593b
DDAG
2378 }
2379
2380 return !!block;
2381}
2382
6c595cde 2383/**
5e58f968
JQ
2384 * migration_page_queue_free: drop any remaining pages in the ram
2385 * request queue
6c595cde 2386 *
3d0684b2
JQ
2387 * It should be empty at the end anyway, but in error cases there may
2388 * be some left. In case there are any pages left, we drop them.
2389 *
6c595cde 2390 */
83c13382 2391static void migration_page_queue_free(RAMState *rs)
6c595cde 2392{
ec481c6c 2393 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2394 /* This queue should generally be empty - but in the case of a failed
2395 * migration it might have some leftover entries.
2396 */
89ac5a1d 2397 RCU_READ_LOCK_GUARD();
ec481c6c 2398 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2399 memory_region_unref(mspr->rb->mr);
ec481c6c 2400 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2401 g_free(mspr);
2402 }
6c595cde
DDAG
2403}
2404
2405/**
3d0684b2
JQ
2406 * ram_save_queue_pages: queue the page for transmission
2407 *
2408 * A request from postcopy destination for example.
2409 *
2410 * Returns zero on success or negative on error
2411 *
3d0684b2
JQ
2412 * @rbname: Name of the RAMBlock of the request. NULL means the
2413 * same as the last one.
2414 * @start: starting address from the start of the RAMBlock
2415 * @len: length (in bytes) to send
6c595cde 2416 */
96506894 2417int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2418{
2419 RAMBlock *ramblock;
53518d94 2420 RAMState *rs = ram_state;
6c595cde 2421
9360447d 2422 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2423 RCU_READ_LOCK_GUARD();
2424
6c595cde
DDAG
2425 if (!rbname) {
2426 /* Reuse last RAMBlock */
68a098f3 2427 ramblock = rs->last_req_rb;
6c595cde
DDAG
2428
2429 if (!ramblock) {
2430 /*
2431 * Shouldn't happen, we can't reuse the last RAMBlock if
2432 * it's the 1st request.
2433 */
2434 error_report("ram_save_queue_pages no previous block");
2435 goto err;
2436 }
2437 } else {
2438 ramblock = qemu_ram_block_by_name(rbname);
2439
2440 if (!ramblock) {
2441 /* We shouldn't be asked for a non-existent RAMBlock */
2442 error_report("ram_save_queue_pages no block '%s'", rbname);
2443 goto err;
2444 }
68a098f3 2445 rs->last_req_rb = ramblock;
6c595cde
DDAG
2446 }
2447 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2448 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2449 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2450 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
2451 __func__, start, len, ramblock->used_length);
2452 goto err;
2453 }
2454
ec481c6c
JQ
2455 struct RAMSrcPageRequest *new_entry =
2456 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2457 new_entry->rb = ramblock;
2458 new_entry->offset = start;
2459 new_entry->len = len;
2460
2461 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2462 qemu_mutex_lock(&rs->src_page_req_mutex);
2463 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2464 migration_make_urgent_request();
ec481c6c 2465 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2466
2467 return 0;
2468
2469err:
6c595cde
DDAG
2470 return -1;
2471}
2472
d7400a34
XG
2473static bool save_page_use_compression(RAMState *rs)
2474{
2475 if (!migrate_use_compression()) {
2476 return false;
2477 }
2478
2479 /*
2480 * If xbzrle is on, stop using the data compression after first
2481 * round of migration even if compression is enabled. In theory,
2482 * xbzrle can do better than compression.
2483 */
2484 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2485 return true;
2486 }
2487
2488 return false;
2489}
2490
5e5fdcff
XG
2491/*
2492 * try to compress the page before posting it out, return true if the page
2493 * has been properly handled by compression, otherwise needs other
2494 * paths to handle it
2495 */
2496static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2497{
2498 if (!save_page_use_compression(rs)) {
2499 return false;
2500 }
2501
2502 /*
2503 * When starting the process of a new block, the first page of
2504 * the block should be sent out before other pages in the same
2505 * block, and all the pages in the last block should have been
2506 * sent out. Keeping this order is important, because the 'cont'
2507 * flag is used to avoid resending the block name.
2508 *
2509 * We post the first page as a normal page as compression will
2510 * take much CPU resource.
2511 */
2512 if (block != rs->last_sent_block) {
2513 flush_compressed_data(rs);
2514 return false;
2515 }
2516
2517 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2518 return true;
2519 }
2520
76e03000 2521 compression_counters.busy++;
5e5fdcff
XG
2522 return false;
2523}
2524
a82d593b 2525/**
3d0684b2 2526 * ram_save_target_page: save one target page
a82d593b 2527 *
3d0684b2 2528 * Returns the number of pages written
a82d593b 2529 *
6f37bb8b 2530 * @rs: current RAM state
3d0684b2 2531 * @pss: data about the page we want to send
a82d593b 2532 * @last_stage: if we are at the completion stage
a82d593b 2533 */
a0a8aa14 2534static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2535 bool last_stage)
a82d593b 2536{
a8ec91f9
XG
2537 RAMBlock *block = pss->block;
2538 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2539 int res;
2540
2541 if (control_save_page(rs, block, offset, &res)) {
2542 return res;
2543 }
2544
5e5fdcff
XG
2545 if (save_compress_page(rs, block, offset)) {
2546 return 1;
d7400a34
XG
2547 }
2548
2549 res = save_zero_page(rs, block, offset);
2550 if (res > 0) {
2551 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2552 * page would be stale
2553 */
2554 if (!save_page_use_compression(rs)) {
2555 XBZRLE_cache_lock();
2556 xbzrle_cache_zero_page(rs, block->offset + offset);
2557 XBZRLE_cache_unlock();
2558 }
2559 ram_release_pages(block->idstr, offset, res);
2560 return res;
2561 }
2562
da3f56cb 2563 /*
5e5fdcff
XG
2564 * do not use multifd for compression as the first page in the new
2565 * block should be posted out before sending the compressed page
da3f56cb 2566 */
5e5fdcff 2567 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2568 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2569 }
2570
1faa5665 2571 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2572}
2573
2574/**
3d0684b2 2575 * ram_save_host_page: save a whole host page
a82d593b 2576 *
3d0684b2
JQ
2577 * Starting at *offset send pages up to the end of the current host
2578 * page. It's valid for the initial offset to point into the middle of
2579 * a host page in which case the remainder of the hostpage is sent.
2580 * Only dirty target pages are sent. Note that the host page size may
2581 * be a huge page for this block.
1eb3fc0a
DDAG
2582 * The saving stops at the boundary of the used_length of the block
2583 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2584 *
3d0684b2
JQ
2585 * Returns the number of pages written or negative on error
2586 *
6f37bb8b 2587 * @rs: current RAM state
3d0684b2 2589 * @pss: data about the page we want to send
a82d593b 2590 * @last_stage: if we are at the completion stage
a82d593b 2591 */
a0a8aa14 2592static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2593 bool last_stage)
a82d593b
DDAG
2594{
2595 int tmppages, pages = 0;
a935e30f
JQ
2596 size_t pagesize_bits =
2597 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2598
fbd162e6 2599 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2600 error_report("block %s should not be migrated !", pss->block->idstr);
2601 return 0;
2602 }
2603
a82d593b 2604 do {
1faa5665
XG
2605 /* Check if the page is dirty and if so send it */
2606 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2607 pss->page++;
2608 continue;
2609 }
2610
f20e2865 2611 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2612 if (tmppages < 0) {
2613 return tmppages;
2614 }
2615
2616 pages += tmppages;
a935e30f 2617 pss->page++;
1eb3fc0a
DDAG
2618 } while ((pss->page & (pagesize_bits - 1)) &&
2619 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2620
2621 /* The offset we leave with is the last one we looked at */
a935e30f 2622 pss->page--;
a82d593b
DDAG
2623 return pages;
2624}
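/*
 * Illustrative sketch, not QEMU code: ram_save_host_page() keeps sending
 * target pages until the page index crosses a host-page boundary, which
 * is what the "(pss->page & (pagesize_bits - 1))" test expresses when the
 * host page is a power-of-two multiple of the target page.  In isolation,
 * with invented names:
 */
#include <stddef.h>

static size_t toy_send_host_page(size_t start_page, size_t host_page_ratio,
                                 size_t total_pages,
                                 void (*send_target_page)(size_t page))
{
    size_t page = start_page;

    do {
        send_target_page(page);
        page++;
        /* stop once we wrap onto the next host page or leave the block */
    } while ((page & (host_page_ratio - 1)) && page < total_pages);

    return page - start_page;
}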
6c595cde 2625
56e93d26 2626/**
3d0684b2 2627 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2628 *
2629 * Called within an RCU critical section.
2630 *
e8f3735f
XG
2631 * Returns the number of pages written where zero means no dirty pages,
2632 * or negative on error
56e93d26 2633 *
6f37bb8b 2634 * @rs: current RAM state
56e93d26 2635 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2636 *
2637 * On systems where host-page-size > target-page-size it will send all the
2638 * pages in a host page that are dirty.
56e93d26
JQ
2639 */
2640
ce25d337 2641static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2642{
b8fb8cb7 2643 PageSearchStatus pss;
56e93d26 2644 int pages = 0;
b9e60928 2645 bool again, found;
56e93d26 2646
0827b9e9
AA
2647 /* No dirty page as there is zero RAM */
2648 if (!ram_bytes_total()) {
2649 return pages;
2650 }
2651
6f37bb8b 2652 pss.block = rs->last_seen_block;
a935e30f 2653 pss.page = rs->last_page;
b8fb8cb7
DDAG
2654 pss.complete_round = false;
2655
2656 if (!pss.block) {
2657 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2658 }
56e93d26 2659
b9e60928 2660 do {
a82d593b 2661 again = true;
f20e2865 2662 found = get_queued_page(rs, &pss);
b9e60928 2663
a82d593b
DDAG
2664 if (!found) {
2665 /* priority queue empty, so just search for something dirty */
f20e2865 2666 found = find_dirty_block(rs, &pss, &again);
a82d593b 2667 }
f3f491fc 2668
a82d593b 2669 if (found) {
f20e2865 2670 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2671 }
b9e60928 2672 } while (!pages && again);
56e93d26 2673
6f37bb8b 2674 rs->last_seen_block = pss.block;
a935e30f 2675 rs->last_page = pss.page;
56e93d26
JQ
2676
2677 return pages;
2678}
2679
2680void acct_update_position(QEMUFile *f, size_t size, bool zero)
2681{
2682 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2683
56e93d26 2684 if (zero) {
9360447d 2685 ram_counters.duplicate += pages;
56e93d26 2686 } else {
9360447d
JQ
2687 ram_counters.normal += pages;
2688 ram_counters.transferred += size;
56e93d26
JQ
2689 qemu_update_position(f, size);
2690 }
2691}
2692
fbd162e6 2693static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2694{
2695 RAMBlock *block;
2696 uint64_t total = 0;
2697
89ac5a1d
DDAG
2698 RCU_READ_LOCK_GUARD();
2699
fbd162e6
YK
2700 if (count_ignored) {
2701 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2702 total += block->used_length;
2703 }
2704 } else {
2705 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2706 total += block->used_length;
2707 }
99e15582 2708 }
56e93d26
JQ
2709 return total;
2710}
2711
fbd162e6
YK
2712uint64_t ram_bytes_total(void)
2713{
2714 return ram_bytes_total_common(false);
2715}
2716
f265e0e4 2717static void xbzrle_load_setup(void)
56e93d26 2718{
f265e0e4 2719 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2720}
2721
f265e0e4
JQ
2722static void xbzrle_load_cleanup(void)
2723{
2724 g_free(XBZRLE.decoded_buf);
2725 XBZRLE.decoded_buf = NULL;
2726}
2727
7d7c96be
PX
2728static void ram_state_cleanup(RAMState **rsp)
2729{
b9ccaf6d
DDAG
2730 if (*rsp) {
2731 migration_page_queue_free(*rsp);
2732 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2733 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2734 g_free(*rsp);
2735 *rsp = NULL;
2736 }
7d7c96be
PX
2737}
2738
84593a08
PX
2739static void xbzrle_cleanup(void)
2740{
2741 XBZRLE_cache_lock();
2742 if (XBZRLE.cache) {
2743 cache_fini(XBZRLE.cache);
2744 g_free(XBZRLE.encoded_buf);
2745 g_free(XBZRLE.current_buf);
2746 g_free(XBZRLE.zero_target_page);
2747 XBZRLE.cache = NULL;
2748 XBZRLE.encoded_buf = NULL;
2749 XBZRLE.current_buf = NULL;
2750 XBZRLE.zero_target_page = NULL;
2751 }
2752 XBZRLE_cache_unlock();
2753}
2754
f265e0e4 2755static void ram_save_cleanup(void *opaque)
56e93d26 2756{
53518d94 2757 RAMState **rsp = opaque;
6b6712ef 2758 RAMBlock *block;
eb859c53 2759
2ff64038 2760 /* the caller holds the iothread lock or is in a bh, so there is
4633456c 2761 * no write race against the migration bitmap
2ff64038 2762 */
6b6712ef
JQ
2763 memory_global_dirty_log_stop();
2764
fbd162e6 2765 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2766 g_free(block->clear_bmap);
2767 block->clear_bmap = NULL;
6b6712ef
JQ
2768 g_free(block->bmap);
2769 block->bmap = NULL;
56e93d26
JQ
2770 }
2771
84593a08 2772 xbzrle_cleanup();
f0afa331 2773 compress_threads_save_cleanup();
7d7c96be 2774 ram_state_cleanup(rsp);
56e93d26
JQ
2775}
2776
6f37bb8b 2777static void ram_state_reset(RAMState *rs)
56e93d26 2778{
6f37bb8b
JQ
2779 rs->last_seen_block = NULL;
2780 rs->last_sent_block = NULL;
269ace29 2781 rs->last_page = 0;
6f37bb8b
JQ
2782 rs->last_version = ram_list.version;
2783 rs->ram_bulk_stage = true;
6eeb63f7 2784 rs->fpo_enabled = false;
56e93d26
JQ
2785}
2786
2787#define MAX_WAIT 50 /* ms, half buffered_file limit */
2788
4f2e4252
DDAG
2789/*
2790 * 'expected' is the value you expect the bitmap mostly to be full
2791 * of; it won't bother printing lines that are all this value.
2792 * If 'todump' is null the migration bitmap is dumped.
2793 */
6b6712ef
JQ
2794void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2795 unsigned long pages)
4f2e4252 2796{
4f2e4252
DDAG
2797 int64_t cur;
2798 int64_t linelen = 128;
2799 char linebuf[129];
2800
6b6712ef 2801 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2802 int64_t curb;
2803 bool found = false;
2804 /*
2805 * Last line; catch the case where the line length
2806 * is longer than remaining ram
2807 */
6b6712ef
JQ
2808 if (cur + linelen > pages) {
2809 linelen = pages - cur;
4f2e4252
DDAG
2810 }
2811 for (curb = 0; curb < linelen; curb++) {
2812 bool thisbit = test_bit(cur + curb, todump);
2813 linebuf[curb] = thisbit ? '1' : '.';
2814 found = found || (thisbit != expected);
2815 }
2816 if (found) {
2817 linebuf[curb] = '\0';
2818 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2819 }
2820 }
2821}
2822
e0b266f0
DDAG
2823/* **** functions for postcopy ***** */
2824
ced1c616
PB
2825void ram_postcopy_migrated_memory_release(MigrationState *ms)
2826{
2827 struct RAMBlock *block;
ced1c616 2828
fbd162e6 2829 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2830 unsigned long *bitmap = block->bmap;
2831 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2832 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2833
2834 while (run_start < range) {
2835 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2836 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2837 (run_end - run_start) << TARGET_PAGE_BITS);
2838 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2839 }
2840 }
2841}
2842
3d0684b2
JQ
2843/**
2844 * postcopy_send_discard_bm_ram: discard a RAMBlock
2845 *
2846 * Returns zero on success
2847 *
e0b266f0 2848 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2849 *
2850 * @ms: current migration state
89dab31b 2851 * @block: RAMBlock to discard
e0b266f0 2852 */
810cf2bb 2853static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2854{
6b6712ef 2855 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2856 unsigned long current;
1e7cf8c3 2857 unsigned long *bitmap = block->bmap;
e0b266f0 2858
6b6712ef 2859 for (current = 0; current < end; ) {
1e7cf8c3 2860 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2861 unsigned long zero, discard_length;
e0b266f0 2862
33a5cb62
WY
2863 if (one >= end) {
2864 break;
2865 }
e0b266f0 2866
1e7cf8c3 2867 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2868
2869 if (zero >= end) {
2870 discard_length = end - one;
e0b266f0 2871 } else {
33a5cb62
WY
2872 discard_length = zero - one;
2873 }
810cf2bb 2874 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2875 current = one + discard_length;
e0b266f0
DDAG
2876 }
2877
2878 return 0;
2879}
2880
3d0684b2
JQ
2881/**
2882 * postcopy_each_ram_send_discard: discard all RAMBlocks
2883 *
2884 * Returns 0 for success or negative for error
2885 *
e0b266f0
DDAG
2886 * Utility for the outgoing postcopy code.
2887 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2888 * passing it bitmap indexes and name.
e0b266f0
DDAG
2889 * (qemu_ram_foreach_block ends up passing unscaled lengths
2890 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2891 *
2892 * @ms: current migration state
e0b266f0
DDAG
2893 */
2894static int postcopy_each_ram_send_discard(MigrationState *ms)
2895{
2896 struct RAMBlock *block;
2897 int ret;
2898
fbd162e6 2899 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2900 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2901
2902 /*
2903 * Postcopy sends chunks of bitmap over the wire, but it
2904 * just needs indexes at this point; this avoids it having
2905 * target-page-specific code.
2906 */
810cf2bb
WY
2907 ret = postcopy_send_discard_bm_ram(ms, block);
2908 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2909 if (ret) {
2910 return ret;
2911 }
2912 }
2913
2914 return 0;
2915}
2916
3d0684b2 2917/**
8324ef86 2918 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2919 *
2920 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2921 * bitmap of a block so that no host page is left partially
2922 * dirty.
99e314eb 2923 *
3d0684b2
JQ
2924 * Postcopy requires that all target pages in a hostpage are dirty or
2925 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2926 *
3d0684b2 2927 * @ms: current migration state
3d0684b2 2928 * @block: block that contains the page we want to canonicalize
99e314eb 2929 */
1e7cf8c3 2930static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2931{
53518d94 2932 RAMState *rs = ram_state;
6b6712ef 2933 unsigned long *bitmap = block->bmap;
29c59172 2934 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2935 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2936 unsigned long run_start;
2937
29c59172
DDAG
2938 if (block->page_size == TARGET_PAGE_SIZE) {
2939 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2940 return;
2941 }
2942
1e7cf8c3
WY
2943 /* Find a dirty page */
2944 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2945
6b6712ef 2946 while (run_start < pages) {
99e314eb
DDAG
2947
2948 /*
2949 * If the start of this run of pages is in the middle of a host
2950 * page, then we need to fixup this host page.
2951 */
9dec3cc3 2952 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2953 /* Find the end of this run */
1e7cf8c3 2954 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2955 /*
2956 * If the end isn't at the start of a host page, then the
2957 * run doesn't finish at the end of a host page
2958 * and we need to discard.
2959 */
99e314eb
DDAG
2960 }
2961
9dec3cc3 2962 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2963 unsigned long page;
dad45ab2
WY
2964 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2965 host_ratio);
2966 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2967
99e314eb
DDAG
2968 /* Clean up the bitmap */
2969 for (page = fixup_start_addr;
2970 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2971 /*
2972 * Remark them as dirty, updating the count for any pages
2973 * that weren't previously dirty.
2974 */
0d8ec885 2975 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2976 }
2977 }
2978
1e7cf8c3
WY
2979 /* Find the next dirty page for the next iteration */
2980 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2981 }
2982}
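/*
 * Illustrative sketch, not QEMU code: the canonicalisation above makes
 * sure no host page is left partially dirty.  When a run of dirty target
 * pages starts in the middle of a host page, the fix-up aligns the start
 * down to a host-page boundary and re-marks every target page of that
 * host page as dirty, counting the newly dirtied ones.  A byte-per-page
 * array stands in for the real bitmap here:
 */
#include <stddef.h>

static size_t toy_fixup_host_page(unsigned char *dirty, size_t run_start,
                                  size_t host_ratio)
{
    size_t fixup_start = run_start - (run_start % host_ratio);
    size_t newly_dirty = 0;

    for (size_t page = fixup_start; page < fixup_start + host_ratio; page++) {
        if (!dirty[page]) {
            dirty[page] = 1;
            newly_dirty++;   /* counted into the dirty page total above */
        }
    }
    return newly_dirty;
}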
2983
3d0684b2 2984/**
89dab31b 2985 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 2986 *
99e314eb
DDAG
2987 * Utility for the outgoing postcopy code.
2988 *
2989 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2990 * dirty host-page size chunks as all dirty. In this case the host-page
2991 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2992 *
3d0684b2
JQ
2993 * Returns zero on success
2994 *
2995 * @ms: current migration state
6b6712ef 2996 * @block: block we want to work with
99e314eb 2997 */
6b6712ef 2998static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2999{
810cf2bb 3000 postcopy_discard_send_init(ms, block->idstr);
99e314eb 3001
6b6712ef 3002 /*
1e7cf8c3 3003 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 3004 */
1e7cf8c3 3005 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 3006
810cf2bb 3007 postcopy_discard_send_finish(ms);
99e314eb
DDAG
3008 return 0;
3009}
3010
3d0684b2
JQ
3011/**
3012 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3013 *
3014 * Returns zero on success
3015 *
e0b266f0
DDAG
3016 * Transmit the set of pages to be discarded after precopy to the target;
3017 * these are pages that:
3018 * a) Have been previously transmitted but are now dirty again
3019 * b) Pages that have never been transmitted, this ensures that
3020 * any pages on the destination that have been mapped by background
3021 * tasks get discarded (transparent huge pages is the specific concern)
3022 * Hopefully this is pretty sparse
3d0684b2
JQ
3023 *
3024 * @ms: current migration state
e0b266f0
DDAG
3025 */
3026int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3027{
53518d94 3028 RAMState *rs = ram_state;
6b6712ef 3029 RAMBlock *block;
e0b266f0 3030 int ret;
e0b266f0 3031
89ac5a1d 3032 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
3033
3034 /* This should be our last sync, the src is now paused */
eb859c53 3035 migration_bitmap_sync(rs);
e0b266f0 3036
6b6712ef
JQ
3037 /* Easiest way to make sure we don't resume in the middle of a host-page */
3038 rs->last_seen_block = NULL;
3039 rs->last_sent_block = NULL;
3040 rs->last_page = 0;
e0b266f0 3041
fbd162e6 3042 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
3043 /* Deal with TPS != HPS and huge pages */
3044 ret = postcopy_chunk_hostpages(ms, block);
3045 if (ret) {
6b6712ef
JQ
3046 return ret;
3047 }
e0b266f0 3048
e0b266f0 3049#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
3050 ram_debug_dump_bitmap(block->bmap, true,
3051 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 3052#endif
6b6712ef
JQ
3053 }
3054 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
3055
3056 ret = postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
3057
3058 return ret;
3059}
3060
3d0684b2
JQ
3061/**
3062 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 3063 *
3d0684b2 3064 * Returns zero on success
e0b266f0 3065 *
36449157
JQ
3066 * @rbname: name of the RAMBlock of the request. NULL means the
3067 * same that last one.
3d0684b2
JQ
3068 * @start: RAMBlock starting page
3069 * @length: RAMBlock size
e0b266f0 3070 */
aaa2064c 3071int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
3072{
3073 int ret = -1;
3074
36449157 3075 trace_ram_discard_range(rbname, start, length);
d3a5038c 3076
89ac5a1d 3077 RCU_READ_LOCK_GUARD();
36449157 3078 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
3079
3080 if (!rb) {
36449157 3081 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
3082 goto err;
3083 }
3084
814bb08f
PX
3085 /*
3086 * On source VM, we don't need to update the received bitmap since
3087 * we don't even have one.
3088 */
3089 if (rb->receivedmap) {
3090 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3091 length >> qemu_target_page_bits());
3092 }
3093
d3a5038c 3094 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3095
3096err:
e0b266f0
DDAG
3097 return ret;
3098}
3099
84593a08
PX
3100/*
3101 * For every allocation, we try not to crash the VM if the
3102 * allocation fails.
3103 */
3104static int xbzrle_init(void)
3105{
3106 Error *local_err = NULL;
3107
3108 if (!migrate_use_xbzrle()) {
3109 return 0;
3110 }
3111
3112 XBZRLE_cache_lock();
3113
3114 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3115 if (!XBZRLE.zero_target_page) {
3116 error_report("%s: Error allocating zero page", __func__);
3117 goto err_out;
3118 }
3119
3120 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3121 TARGET_PAGE_SIZE, &local_err);
3122 if (!XBZRLE.cache) {
3123 error_report_err(local_err);
3124 goto free_zero_page;
3125 }
3126
3127 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3128 if (!XBZRLE.encoded_buf) {
3129 error_report("%s: Error allocating encoded_buf", __func__);
3130 goto free_cache;
3131 }
3132
3133 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3134 if (!XBZRLE.current_buf) {
3135 error_report("%s: Error allocating current_buf", __func__);
3136 goto free_encoded_buf;
3137 }
3138
3139 /* We are all good */
3140 XBZRLE_cache_unlock();
3141 return 0;
3142
3143free_encoded_buf:
3144 g_free(XBZRLE.encoded_buf);
3145 XBZRLE.encoded_buf = NULL;
3146free_cache:
3147 cache_fini(XBZRLE.cache);
3148 XBZRLE.cache = NULL;
3149free_zero_page:
3150 g_free(XBZRLE.zero_target_page);
3151 XBZRLE.zero_target_page = NULL;
3152err_out:
3153 XBZRLE_cache_unlock();
3154 return -ENOMEM;
3155}
3156
53518d94 3157static int ram_state_init(RAMState **rsp)
56e93d26 3158{
7d00ee6a
PX
3159 *rsp = g_try_new0(RAMState, 1);
3160
3161 if (!*rsp) {
3162 error_report("%s: Init ramstate fail", __func__);
3163 return -1;
3164 }
53518d94
JQ
3165
3166 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3167 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3168 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3169
7d00ee6a 3170 /*
40c4d4a8
IR
3171 * Count the total number of pages used by ram blocks not including any
3172 * gaps due to alignment or unplugs.
03158519 3173 * This must match the initial value of the dirty bitmap.
7d00ee6a 3174 */
40c4d4a8 3175 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3176 ram_state_reset(*rsp);
3177
3178 return 0;
3179}
3180
d6eff5d7 3181static void ram_list_init_bitmaps(void)
7d00ee6a 3182{
002cad6b 3183 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3184 RAMBlock *block;
3185 unsigned long pages;
002cad6b 3186 uint8_t shift;
56e93d26 3187
0827b9e9
AA
3188 /* Skip setting bitmap if there is no RAM */
3189 if (ram_bytes_total()) {
002cad6b
PX
3190 shift = ms->clear_bitmap_shift;
3191 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3192 error_report("clear_bitmap_shift (%u) too big, using "
3193 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3194 shift = CLEAR_BITMAP_SHIFT_MAX;
3195 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3196 error_report("clear_bitmap_shift (%u) too small, using "
3197 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3198 shift = CLEAR_BITMAP_SHIFT_MIN;
3199 }
3200
fbd162e6 3201 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3202 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3203 /*
3204 * The initial dirty bitmap for migration must be set with all
3205 * ones to make sure we'll migrate every guest RAM page to
3206 * destination.
40c4d4a8
IR
3207 * Here we set RAMBlock.bmap all to 1 because when we restart a
3208 * new migration after a failed one, ram_list.
3209 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3210 * guest memory.
03158519 3211 */
6b6712ef 3212 block->bmap = bitmap_new(pages);
40c4d4a8 3213 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3214 block->clear_bmap_shift = shift;
3215 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3216 }
f3f491fc 3217 }
d6eff5d7
PX
3218}
3219
3220static void ram_init_bitmaps(RAMState *rs)
3221{
3222 /* For memory_global_dirty_log_start below. */
3223 qemu_mutex_lock_iothread();
3224 qemu_mutex_lock_ramlist();
f3f491fc 3225
89ac5a1d
DDAG
3226 WITH_RCU_READ_LOCK_GUARD() {
3227 ram_list_init_bitmaps();
3228 memory_global_dirty_log_start();
3229 migration_bitmap_sync_precopy(rs);
3230 }
56e93d26 3231 qemu_mutex_unlock_ramlist();
49877834 3232 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3233}
3234
3235static int ram_init_all(RAMState **rsp)
3236{
3237 if (ram_state_init(rsp)) {
3238 return -1;
3239 }
3240
3241 if (xbzrle_init()) {
3242 ram_state_cleanup(rsp);
3243 return -1;
3244 }
3245
3246 ram_init_bitmaps(*rsp);
a91246c9
HZ
3247
3248 return 0;
3249}
3250
08614f34
PX
3251static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3252{
3253 RAMBlock *block;
3254 uint64_t pages = 0;
3255
3256 /*
3257 * Postcopy is not using xbzrle/compression, so no need for that.
3258 * Also, since the source is already halted, we don't need to care
3259 * about dirty page logging either.
3260 */
3261
fbd162e6 3262 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3263 pages += bitmap_count_one(block->bmap,
3264 block->used_length >> TARGET_PAGE_BITS);
3265 }
3266
3267 /* This may not be aligned with current bitmaps. Recalculate. */
3268 rs->migration_dirty_pages = pages;
3269
3270 rs->last_seen_block = NULL;
3271 rs->last_sent_block = NULL;
3272 rs->last_page = 0;
3273 rs->last_version = ram_list.version;
3274 /*
3275 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3276 * matter what we have sent.
3277 */
3278 rs->ram_bulk_stage = false;
3279
3280 /* Update RAMState cache of output QEMUFile */
3281 rs->f = out;
3282
3283 trace_ram_state_resume_prepare(pages);
3284}
3285
6bcb05fc
WW
3286/*
3287 * This function clears bits of the free pages reported by the caller from the
3288 * migration dirty bitmap. @addr is the host address corresponding to the
3289 * start of the contiguous guest free pages, and @len is the total bytes of
3290 * those pages.
3291 */
3292void qemu_guest_free_page_hint(void *addr, size_t len)
3293{
3294 RAMBlock *block;
3295 ram_addr_t offset;
3296 size_t used_len, start, npages;
3297 MigrationState *s = migrate_get_current();
3298
3299 /* This function is currently expected to be used during live migration */
3300 if (!migration_is_setup_or_active(s->state)) {
3301 return;
3302 }
3303
3304 for (; len > 0; len -= used_len, addr += used_len) {
3305 block = qemu_ram_block_from_host(addr, false, &offset);
3306 if (unlikely(!block || offset >= block->used_length)) {
3307 /*
3308 * The implementation might not support RAMBlock resize during
3309 * live migration, but it could happen in theory with future
3310 * updates. So we add a check here to capture that case.
3311 */
3312 error_report_once("%s unexpected error", __func__);
3313 return;
3314 }
3315
3316 if (len <= block->used_length - offset) {
3317 used_len = len;
3318 } else {
3319 used_len = block->used_length - offset;
3320 }
3321
3322 start = offset >> TARGET_PAGE_BITS;
3323 npages = used_len >> TARGET_PAGE_BITS;
3324
3325 qemu_mutex_lock(&ram_state->bitmap_mutex);
3326 ram_state->migration_dirty_pages -=
3327 bitmap_count_one_with_offset(block->bmap, start, npages);
3328 bitmap_clear(block->bmap, start, npages);
3329 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3330 }
3331}
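/*
 * Illustrative sketch, not QEMU code: qemu_guest_free_page_hint() turns a
 * (host address, length) hint into a page range inside one RAMBlock and
 * clears those dirty bits under the bitmap mutex.  The core arithmetic,
 * with a byte-per-page array standing in for the bitmap and a 4 KiB page
 * size assumed:
 */
#include <stddef.h>

#define TOY_PAGE_BITS 12

static size_t toy_apply_free_hint(unsigned char *dirty, size_t block_pages,
                                  size_t offset, size_t len)
{
    size_t start = offset >> TOY_PAGE_BITS;
    size_t npages = len >> TOY_PAGE_BITS;
    size_t cleared = 0;

    if (start >= block_pages) {
        return 0;
    }
    if (npages > block_pages - start) {
        npages = block_pages - start;   /* clamp to the block's used length */
    }
    for (size_t i = start; i < start + npages; i++) {
        cleared += dirty[i];            /* count pages that were dirty */
        dirty[i] = 0;
    }
    return cleared;    /* subtracted from the migration dirty page count */
}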
3332
3d0684b2
JQ
3333/*
3334 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3335 * long-running RCU critical section. When rcu-reclaims in the code
3336 * start to become numerous it will be necessary to reduce the
3337 * granularity of these critical sections.
3338 */
3339
3d0684b2
JQ
3340/**
3341 * ram_save_setup: Setup RAM for migration
3342 *
3343 * Returns zero to indicate success and negative for error
3344 *
3345 * @f: QEMUFile where to send the data
3346 * @opaque: RAMState pointer
3347 */
a91246c9
HZ
3348static int ram_save_setup(QEMUFile *f, void *opaque)
3349{
53518d94 3350 RAMState **rsp = opaque;
a91246c9
HZ
3351 RAMBlock *block;
3352
dcaf446e
XG
3353 if (compress_threads_save_setup()) {
3354 return -1;
3355 }
3356
a91246c9
HZ
3357 /* migration has already setup the bitmap, reuse it. */
3358 if (!migration_in_colo_state()) {
7d00ee6a 3359 if (ram_init_all(rsp) != 0) {
dcaf446e 3360 compress_threads_save_cleanup();
a91246c9 3361 return -1;
53518d94 3362 }
a91246c9 3363 }
53518d94 3364 (*rsp)->f = f;
a91246c9 3365
0e6ebd48
DDAG
3366 WITH_RCU_READ_LOCK_GUARD() {
3367 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3368
0e6ebd48
DDAG
3369 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3370 qemu_put_byte(f, strlen(block->idstr));
3371 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3372 qemu_put_be64(f, block->used_length);
3373 if (migrate_postcopy_ram() && block->page_size !=
3374 qemu_host_page_size) {
3375 qemu_put_be64(f, block->page_size);
3376 }
3377 if (migrate_ignore_shared()) {
3378 qemu_put_be64(f, block->mr->addr);
3379 }
fbd162e6 3380 }
56e93d26
JQ
3381 }
3382
56e93d26
JQ
3383 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3384 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3385
1b81c974 3386 multifd_send_sync_main(*rsp);
56e93d26 3387 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3388 qemu_fflush(f);
56e93d26
JQ
3389
3390 return 0;
3391}
3392
3d0684b2
JQ
3393/**
3394 * ram_save_iterate: iterative stage for migration
3395 *
3396 * Returns zero to indicate success and negative for error
3397 *
3398 * @f: QEMUFile where to send the data
3399 * @opaque: RAMState pointer
3400 */
56e93d26
JQ
3401static int ram_save_iterate(QEMUFile *f, void *opaque)
3402{
53518d94
JQ
3403 RAMState **temp = opaque;
3404 RAMState *rs = *temp;
56e93d26
JQ
3405 int ret;
3406 int i;
3407 int64_t t0;
5c90308f 3408 int done = 0;
56e93d26 3409
b2557345
PL
3410 if (blk_mig_bulk_active()) {
3411 /* Avoid transferring ram during bulk phase of block migration as
3412 * the bulk phase will usually take a long time and transferring
3413 * ram updates during that time is pointless. */
3414 goto out;
3415 }
3416
89ac5a1d
DDAG
3417 WITH_RCU_READ_LOCK_GUARD() {
3418 if (ram_list.version != rs->last_version) {
3419 ram_state_reset(rs);
3420 }
56e93d26 3421
89ac5a1d
DDAG
3422 /* Read version before ram_list.blocks */
3423 smp_rmb();
56e93d26 3424
89ac5a1d 3425 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3426
89ac5a1d
DDAG
3427 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3428 i = 0;
3429 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3430 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3431 int pages;
e03a34f8 3432
89ac5a1d
DDAG
3433 if (qemu_file_get_error(f)) {
3434 break;
3435 }
e8f3735f 3436
89ac5a1d
DDAG
3437 pages = ram_find_and_save_block(rs, false);
3438 /* no more pages to send */
3439 if (pages == 0) {
3440 done = 1;
3441 break;
3442 }
e8f3735f 3443
89ac5a1d
DDAG
3444 if (pages < 0) {
3445 qemu_file_set_error(f, pages);
56e93d26
JQ
3446 break;
3447 }
89ac5a1d
DDAG
3448
3449 rs->target_page_count += pages;
3450
3451 /*
3452 * we want to check in the 1st loop, just in case it was the 1st
3453 * time and we had to sync the dirty bitmap.
3454 * qemu_clock_get_ns() is a bit expensive, so we only check it
3455 * every few iterations
3456 */
3457 if ((i & 63) == 0) {
3458 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3459 1000000;
3460 if (t1 > MAX_WAIT) {
3461 trace_ram_save_iterate_big_wait(t1, i);
3462 break;
3463 }
3464 }
3465 i++;
56e93d26 3466 }
56e93d26 3467 }
56e93d26
JQ
3468
3469 /*
3470 * Must occur before EOS (or any QEMUFile operation)
3471 * because of RDMA protocol.
3472 */
3473 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3474
b2557345 3475out:
1b81c974 3476 multifd_send_sync_main(rs);
56e93d26 3477 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3478 qemu_fflush(f);
9360447d 3479 ram_counters.transferred += 8;
56e93d26
JQ
3480
3481 ret = qemu_file_get_error(f);
3482 if (ret < 0) {
3483 return ret;
3484 }
3485
5c90308f 3486 return done;
56e93d26
JQ
3487}
3488
3d0684b2
JQ
3489/**
3490 * ram_save_complete: function called to send the remaining amount of ram
3491 *
e8f3735f 3492 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3493 *
3494 * Called with the iothread lock held
3495 *
3496 * @f: QEMUFile where to send the data
3497 * @opaque: RAMState pointer
3498 */
56e93d26
JQ
3499static int ram_save_complete(QEMUFile *f, void *opaque)
3500{
53518d94
JQ
3501 RAMState **temp = opaque;
3502 RAMState *rs = *temp;
e8f3735f 3503 int ret = 0;
6f37bb8b 3504
89ac5a1d
DDAG
3505 WITH_RCU_READ_LOCK_GUARD() {
3506 if (!migration_in_postcopy()) {
3507 migration_bitmap_sync_precopy(rs);
3508 }
56e93d26 3509
89ac5a1d 3510 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3511
89ac5a1d 3512 /* try transferring iterative blocks of memory */
56e93d26 3513
89ac5a1d
DDAG
3514 /* flush all remaining blocks regardless of rate limiting */
3515 while (true) {
3516 int pages;
56e93d26 3517
89ac5a1d
DDAG
3518 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3519 /* no more blocks to send */
3520 if (pages == 0) {
3521 break;
3522 }
3523 if (pages < 0) {
3524 ret = pages;
3525 break;
3526 }
e8f3735f 3527 }
56e93d26 3528
89ac5a1d
DDAG
3529 flush_compressed_data(rs);
3530 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3531 }
d09a6fde 3532
1b81c974 3533 multifd_send_sync_main(rs);
56e93d26 3534 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3535 qemu_fflush(f);
56e93d26 3536
e8f3735f 3537 return ret;
56e93d26
JQ
3538}
3539
c31b098f 3540static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3541 uint64_t *res_precopy_only,
3542 uint64_t *res_compatible,
3543 uint64_t *res_postcopy_only)
56e93d26 3544{
53518d94
JQ
3545 RAMState **temp = opaque;
3546 RAMState *rs = *temp;
56e93d26
JQ
3547 uint64_t remaining_size;
3548
9edabd4d 3549 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3550
5727309d 3551 if (!migration_in_postcopy() &&
663e6c1d 3552 remaining_size < max_size) {
56e93d26 3553 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3554 WITH_RCU_READ_LOCK_GUARD() {
3555 migration_bitmap_sync_precopy(rs);
3556 }
56e93d26 3557 qemu_mutex_unlock_iothread();
9edabd4d 3558 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3559 }
c31b098f 3560
86e1167e
VSO
3561 if (migrate_postcopy_ram()) {
3562 /* We can do postcopy, and all the data is postcopiable */
47995026 3563 *res_compatible += remaining_size;
86e1167e 3564 } else {
47995026 3565 *res_precopy_only += remaining_size;
86e1167e 3566 }
56e93d26
JQ
3567}
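/*
 * The estimate above is simply "dirty pages * target page size"; as a
 * rough worked example (assuming a target with 4 KiB pages), 10000
 * dirty pages are reported as ~40 MB still to send, and the bitmap is
 * only re-synced under the iothread lock (outside postcopy) once that
 * estimate drops below max_size.
 */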
3568
3569static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3570{
3571 unsigned int xh_len;
3572 int xh_flags;
063e760a 3573 uint8_t *loaded_data;
56e93d26 3574
56e93d26
JQ
3575 /* extract RLE header */
3576 xh_flags = qemu_get_byte(f);
3577 xh_len = qemu_get_be16(f);
3578
3579 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3580 error_report("Failed to load XBZRLE page - wrong compression!");
3581 return -1;
3582 }
3583
3584 if (xh_len > TARGET_PAGE_SIZE) {
3585 error_report("Failed to load XBZRLE page - len overflow!");
3586 return -1;
3587 }
f265e0e4 3588 loaded_data = XBZRLE.decoded_buf;
56e93d26 3589 /* load data and decode */
f265e0e4 3590 /* it can change loaded_data to point to an internal buffer */
063e760a 3591 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3592
3593 /* decode RLE */
063e760a 3594 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3595 TARGET_PAGE_SIZE) == -1) {
3596 error_report("Failed to load XBZRLE page - decode error!");
3597 return -1;
3598 }
3599
3600 return 0;
3601}
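/*
 * For reference, the XBZRLE page as read above is laid out as:
 *
 *   byte : ENCODING_FLAG_XBZRLE
 *   be16 : xh_len (encoded length, at most TARGET_PAGE_SIZE)
 *   bytes: xh_len bytes of encoded data, applied by
 *          xbzrle_decode_buffer() as a delta against the current
 *          contents of the target page at 'host'
 */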
3602
3d0684b2
JQ
3603/**
3604 * ram_block_from_stream: read a RAMBlock id from the migration stream
3605 *
3606 * Must be called from within an RCU critical section.
3607 *
56e93d26 3608 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3609 *
3d0684b2
JQ
3610 * @f: QEMUFile where to read the data from
3611 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3612 */
3d0684b2 3613static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3614{
3615 static RAMBlock *block = NULL;
3616 char id[256];
3617 uint8_t len;
3618
3619 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3620 if (!block) {
56e93d26
JQ
3621 error_report("Ack, bad migration stream!");
3622 return NULL;
3623 }
4c4bad48 3624 return block;
56e93d26
JQ
3625 }
3626
3627 len = qemu_get_byte(f);
3628 qemu_get_buffer(f, (uint8_t *)id, len);
3629 id[len] = 0;
3630
e3dd7493 3631 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3632 if (!block) {
3633 error_report("Can't find block %s", id);
3634 return NULL;
56e93d26
JQ
3635 }
3636
fbd162e6 3637 if (ramblock_is_ignored(block)) {
b895de50
CLG
3638 error_report("block %s should not be migrated !", id);
3639 return NULL;
3640 }
3641
4c4bad48
HZ
3642 return block;
3643}
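/*
 * On the wire a block reference is therefore either just the
 * RAM_SAVE_FLAG_CONTINUE bit in the page flags (meaning "same block as
 * the previous page", cached in the static 'block' above), or:
 *
 *   byte : strlen(idstr)
 *   bytes: idstr
 *
 * followed by the rest of the page record.
 */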
3644
3645static inline void *host_from_ram_block_offset(RAMBlock *block,
3646 ram_addr_t offset)
3647{
3648 if (!offset_in_ramblock(block, offset)) {
3649 return NULL;
3650 }
3651
3652 return block->host + offset;
56e93d26
JQ
3653}
3654
13af18f2
ZC
3655static inline void *colo_cache_from_block_offset(RAMBlock *block,
3656 ram_addr_t offset)
3657{
3658 if (!offset_in_ramblock(block, offset)) {
3659 return NULL;
3660 }
3661 if (!block->colo_cache) {
3662 error_report("%s: colo_cache is NULL in block: %s",
3663 __func__, block->idstr);
3664 return NULL;
3665 }
7d9acafa
ZC
3666
3667 /*
3668 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3669 * It helps us decide which pages in the ram cache should be flushed
3670 * into the VM's RAM later.
3671 */
3672 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3673 ram_state->migration_dirty_pages++;
3674 }
13af18f2
ZC
3675 return block->colo_cache + offset;
3676}
3677
3d0684b2
JQ
3678/**
3679 * ram_handle_compressed: handle the zero page case
3680 *
56e93d26
JQ
3681 * If a page (or a whole RDMA chunk) has been
3682 * determined to be zero, then zap it.
3d0684b2
JQ
3683 *
3684 * @host: host address for the zero page
3685 * @ch: what the page is filled with. We only support zero
3686 * @size: size of the zero page
56e93d26
JQ
3687 */
3688void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3689{
3690 if (ch != 0 || !is_zero_range(host, size)) {
3691 memset(host, ch, size);
3692 }
3693}
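/*
 * Note the check above: when the fill byte is zero and the page already
 * reads back as zero, the memset() is skipped entirely so that an
 * untouched (e.g. not yet allocated) destination page is not dirtied
 * just to fill it with zeros.
 */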
3694
797ca154
XG
3695/* return the size after decompression, or negative value on error */
3696static int
3697qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3698 const uint8_t *source, size_t source_len)
3699{
3700 int err;
3701
3702 err = inflateReset(stream);
3703 if (err != Z_OK) {
3704 return -1;
3705 }
3706
3707 stream->avail_in = source_len;
3708 stream->next_in = (uint8_t *)source;
3709 stream->avail_out = dest_len;
3710 stream->next_out = dest;
3711
3712 err = inflate(stream, Z_NO_FLUSH);
3713 if (err != Z_STREAM_END) {
3714 return -1;
3715 }
3716
3717 return stream->total_out;
3718}
3719
56e93d26
JQ
3720static void *do_data_decompress(void *opaque)
3721{
3722 DecompressParam *param = opaque;
3723 unsigned long pagesize;
33d151f4 3724 uint8_t *des;
34ab9e97 3725 int len, ret;
56e93d26 3726
33d151f4 3727 qemu_mutex_lock(&param->mutex);
90e56fb4 3728 while (!param->quit) {
33d151f4
LL
3729 if (param->des) {
3730 des = param->des;
3731 len = param->len;
3732 param->des = 0;
3733 qemu_mutex_unlock(&param->mutex);
3734
56e93d26 3735 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3736
3737 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3738 param->compbuf, len);
f548222c 3739 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3740 error_report("decompress data failed");
3741 qemu_file_set_error(decomp_file, ret);
3742 }
73a8912b 3743
33d151f4
LL
3744 qemu_mutex_lock(&decomp_done_lock);
3745 param->done = true;
3746 qemu_cond_signal(&decomp_done_cond);
3747 qemu_mutex_unlock(&decomp_done_lock);
3748
3749 qemu_mutex_lock(&param->mutex);
3750 } else {
3751 qemu_cond_wait(&param->cond, &param->mutex);
3752 }
56e93d26 3753 }
33d151f4 3754 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3755
3756 return NULL;
3757}
3758
34ab9e97 3759static int wait_for_decompress_done(void)
5533b2e9
LL
3760{
3761 int idx, thread_count;
3762
3763 if (!migrate_use_compression()) {
34ab9e97 3764 return 0;
5533b2e9
LL
3765 }
3766
3767 thread_count = migrate_decompress_threads();
3768 qemu_mutex_lock(&decomp_done_lock);
3769 for (idx = 0; idx < thread_count; idx++) {
3770 while (!decomp_param[idx].done) {
3771 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3772 }
3773 }
3774 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3775 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3776}
3777
f0afa331 3778static void compress_threads_load_cleanup(void)
56e93d26
JQ
3779{
3780 int i, thread_count;
3781
3416ab5b
JQ
3782 if (!migrate_use_compression()) {
3783 return;
3784 }
56e93d26
JQ
3785 thread_count = migrate_decompress_threads();
3786 for (i = 0; i < thread_count; i++) {
797ca154
XG
3787 /*
3788 * we use it as an indicator which shows if the thread is
3789 * properly init'd or not
3790 */
3791 if (!decomp_param[i].compbuf) {
3792 break;
3793 }
3794
56e93d26 3795 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3796 decomp_param[i].quit = true;
56e93d26
JQ
3797 qemu_cond_signal(&decomp_param[i].cond);
3798 qemu_mutex_unlock(&decomp_param[i].mutex);
3799 }
3800 for (i = 0; i < thread_count; i++) {
797ca154
XG
3801 if (!decomp_param[i].compbuf) {
3802 break;
3803 }
3804
56e93d26
JQ
3805 qemu_thread_join(decompress_threads + i);
3806 qemu_mutex_destroy(&decomp_param[i].mutex);
3807 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3808 inflateEnd(&decomp_param[i].stream);
56e93d26 3809 g_free(decomp_param[i].compbuf);
797ca154 3810 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3811 }
3812 g_free(decompress_threads);
3813 g_free(decomp_param);
56e93d26
JQ
3814 decompress_threads = NULL;
3815 decomp_param = NULL;
34ab9e97 3816 decomp_file = NULL;
56e93d26
JQ
3817}
3818
34ab9e97 3819static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3820{
3821 int i, thread_count;
3822
3823 if (!migrate_use_compression()) {
3824 return 0;
3825 }
3826
3827 thread_count = migrate_decompress_threads();
3828 decompress_threads = g_new0(QemuThread, thread_count);
3829 decomp_param = g_new0(DecompressParam, thread_count);
3830 qemu_mutex_init(&decomp_done_lock);
3831 qemu_cond_init(&decomp_done_cond);
34ab9e97 3832 decomp_file = f;
797ca154
XG
3833 for (i = 0; i < thread_count; i++) {
3834 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3835 goto exit;
3836 }
3837
3838 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3839 qemu_mutex_init(&decomp_param[i].mutex);
3840 qemu_cond_init(&decomp_param[i].cond);
3841 decomp_param[i].done = true;
3842 decomp_param[i].quit = false;
3843 qemu_thread_create(decompress_threads + i, "decompress",
3844 do_data_decompress, decomp_param + i,
3845 QEMU_THREAD_JOINABLE);
3846 }
3847 return 0;
3848exit:
3849 compress_threads_load_cleanup();
3850 return -1;
3851}
3852
c1bc6626 3853static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3854 void *host, int len)
3855{
3856 int idx, thread_count;
3857
3858 thread_count = migrate_decompress_threads();
73a8912b 3859 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3860 while (true) {
3861 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3862 if (decomp_param[idx].done) {
33d151f4
LL
3863 decomp_param[idx].done = false;
3864 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3865 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3866 decomp_param[idx].des = host;
3867 decomp_param[idx].len = len;
33d151f4
LL
3868 qemu_cond_signal(&decomp_param[idx].cond);
3869 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3870 break;
3871 }
3872 }
3873 if (idx < thread_count) {
3874 break;
73a8912b
LL
3875 } else {
3876 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3877 }
3878 }
73a8912b 3879 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3880}
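/*
 * The hand-off above works as follows: the main load thread picks an
 * idle worker (done == true), marks it busy, reads the compressed bytes
 * into that worker's compbuf under param->mutex, records the
 * destination and length, and signals param->cond.  The worker
 * (do_data_decompress) inflates compbuf into the destination page, then
 * sets done = true and signals decomp_done_cond.  When every worker is
 * busy, the main thread blocks on decomp_done_cond until one finishes.
 */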
3881
13af18f2
ZC
3882/*
3883 * colo cache: this is for the secondary VM, we cache the whole
3884 * memory of the secondary VM. The global lock must be held to
3885 * call this helper.
3886 */
3887int colo_init_ram_cache(void)
3888{
3889 RAMBlock *block;
3890
3891 rcu_read_lock();
fbd162e6 3892 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
3893 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3894 NULL,
3895 false);
3896 if (!block->colo_cache) {
3897 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3898 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3899 block->used_length);
89ac5a1d
DDAG
3900 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3901 if (block->colo_cache) {
3902 qemu_anon_ram_free(block->colo_cache, block->used_length);
3903 block->colo_cache = NULL;
3904 }
3905 }
3906 return -errno;
13af18f2
ZC
3907 }
3908 memcpy(block->colo_cache, block->host, block->used_length);
3909 }
3910 rcu_read_unlock();
7d9acafa
ZC
3911 /*
3912 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3913 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3914 * we use the same name 'ram_bitmap' as for migration.
3915 */
3916 if (ram_bytes_total()) {
3917 RAMBlock *block;
3918
fbd162e6 3919 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3920 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3921
3922 block->bmap = bitmap_new(pages);
3923 bitmap_set(block->bmap, 0, pages);
3924 }
3925 }
3926 ram_state = g_new0(RAMState, 1);
3927 ram_state->migration_dirty_pages = 0;
c6e5bafb 3928 qemu_mutex_init(&ram_state->bitmap_mutex);
d1955d22 3929 memory_global_dirty_log_start();
7d9acafa 3930
13af18f2 3931 return 0;
13af18f2
ZC
3932}
3933
3934/* The global lock must be held to call this helper */
3935void colo_release_ram_cache(void)
3936{
3937 RAMBlock *block;
3938
d1955d22 3939 memory_global_dirty_log_stop();
fbd162e6 3940 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3941 g_free(block->bmap);
3942 block->bmap = NULL;
3943 }
3944
89ac5a1d
DDAG
3945 WITH_RCU_READ_LOCK_GUARD() {
3946 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3947 if (block->colo_cache) {
3948 qemu_anon_ram_free(block->colo_cache, block->used_length);
3949 block->colo_cache = NULL;
3950 }
13af18f2
ZC
3951 }
3952 }
c6e5bafb 3953 qemu_mutex_destroy(&ram_state->bitmap_mutex);
7d9acafa
ZC
3954 g_free(ram_state);
3955 ram_state = NULL;
13af18f2
ZC
3956}
3957
f265e0e4
JQ
3958/**
3959 * ram_load_setup: Setup RAM for migration incoming side
3960 *
3961 * Returns zero to indicate success and negative for error
3962 *
3963 * @f: QEMUFile where to receive the data
3964 * @opaque: RAMState pointer
3965 */
3966static int ram_load_setup(QEMUFile *f, void *opaque)
3967{
34ab9e97 3968 if (compress_threads_load_setup(f)) {
797ca154
XG
3969 return -1;
3970 }
3971
f265e0e4 3972 xbzrle_load_setup();
f9494614 3973 ramblock_recv_map_init();
13af18f2 3974
f265e0e4
JQ
3975 return 0;
3976}
3977
3978static int ram_load_cleanup(void *opaque)
3979{
f9494614 3980 RAMBlock *rb;
56eb90af 3981
fbd162e6 3982 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3983 qemu_ram_block_writeback(rb);
56eb90af
JH
3984 }
3985
f265e0e4 3986 xbzrle_load_cleanup();
f0afa331 3987 compress_threads_load_cleanup();
f9494614 3988
fbd162e6 3989 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3990 g_free(rb->receivedmap);
3991 rb->receivedmap = NULL;
3992 }
13af18f2 3993
f265e0e4
JQ
3994 return 0;
3995}
3996
3d0684b2
JQ
3997/**
3998 * ram_postcopy_incoming_init: allocate postcopy data structures
3999 *
4000 * Returns 0 for success and negative on error
4001 *
4002 * @mis: current migration incoming state
4003 *
4004 * Allocate data structures etc. needed by incoming migration with
4005 * postcopy-ram. postcopy-ram's similarly named
4006 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4007 */
4008int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4009{
c136180c 4010 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4011}
4012
3d0684b2
JQ
4013/**
4014 * ram_load_postcopy: load a page in postcopy case
4015 *
4016 * Returns 0 for success or -errno in case of error
4017 *
a7180877
DDAG
4018 * Called in postcopy mode by ram_load().
4019 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4020 *
4021 * @f: QEMUFile to receive the data from
a7180877
DDAG
4022 */
4023static int ram_load_postcopy(QEMUFile *f)
4024{
4025 int flags = 0, ret = 0;
4026 bool place_needed = false;
1aa83678 4027 bool matches_target_page_size = false;
a7180877
DDAG
4028 MigrationIncomingState *mis = migration_incoming_get_current();
4029 /* Temporary page that is later 'placed' */
3414322a 4030 void *postcopy_host_page = mis->postcopy_tmp_page;
c53b7ddc 4031 void *last_host = NULL;
a3b6ff6d 4032 bool all_zero = false;
a7180877
DDAG
4033
4034 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4035 ram_addr_t addr;
4036 void *host = NULL;
4037 void *page_buffer = NULL;
4038 void *place_source = NULL;
df9ff5e1 4039 RAMBlock *block = NULL;
a7180877 4040 uint8_t ch;
a7180877
DDAG
4041
4042 addr = qemu_get_be64(f);
7a9ddfbf
PX
4043
4044 /*
4045 * If there is a qemu file error, we should stop here; "addr"
4046 * may be invalid
4047 */
4048 ret = qemu_file_get_error(f);
4049 if (ret) {
4050 break;
4051 }
4052
a7180877
DDAG
4053 flags = addr & ~TARGET_PAGE_MASK;
4054 addr &= TARGET_PAGE_MASK;
4055
4056 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4057 place_needed = false;
bb890ed5 4058 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 4059 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
4060
4061 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
4062 if (!host) {
4063 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4064 ret = -EINVAL;
4065 break;
4066 }
1aa83678 4067 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4068 /*
28abd200
DDAG
4069 * Postcopy requires that we place whole host pages atomically;
4070 * these may be huge pages for RAMBlocks that are backed by
4071 * hugetlbfs.
a7180877
DDAG
4072 * To make it atomic, the data is read into a temporary page
4073 * that's moved into place later.
4074 * The migration protocol uses, possibly smaller, target pages;
4075 * however, the source ensures it always sends all the components
4076 * of a host page in order.
4077 */
4078 page_buffer = postcopy_host_page +
28abd200 4079 ((uintptr_t)host & (block->page_size - 1));
a7180877 4080 /* If all TP are zero then we can optimise the place */
28abd200 4081 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 4082 all_zero = true;
c53b7ddc
DDAG
4083 } else {
4084 /* not the 1st TP within the HP */
4085 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 4086 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
4087 host, last_host);
4088 ret = -EINVAL;
4089 break;
4090 }
a7180877
DDAG
4091 }
4092
c53b7ddc 4093
a7180877
DDAG
4094 /*
4095 * If it's the last part of a host page then we place the host
4096 * page
4097 */
4098 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 4099 (block->page_size - 1)) == 0;
a7180877
DDAG
4100 place_source = postcopy_host_page;
4101 }
c53b7ddc 4102 last_host = host;
a7180877
DDAG
4103
4104 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4105 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
4106 ch = qemu_get_byte(f);
4107 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4108 if (ch) {
4109 all_zero = false;
4110 }
4111 break;
4112
4113 case RAM_SAVE_FLAG_PAGE:
4114 all_zero = false;
1aa83678
PX
4115 if (!matches_target_page_size) {
4116 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4117 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4118 } else {
1aa83678
PX
4119 /*
4120 * For small pages that match the target page size, we
4121 * avoid the qemu_file copy. Instead we directly use
4122 * the buffer of QEMUFile to place the page. Note: we
4123 * cannot do any QEMUFile operation before using that
4124 * buffer to make sure the buffer is valid when
4125 * placing the page.
a7180877
DDAG
4126 */
4127 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4128 TARGET_PAGE_SIZE);
4129 }
4130 break;
4131 case RAM_SAVE_FLAG_EOS:
4132 /* normal exit */
6df264ac 4133 multifd_recv_sync_main();
a7180877
DDAG
4134 break;
4135 default:
4136 error_report("Unknown combination of migration flags: %#x"
4137 " (postcopy mode)", flags);
4138 ret = -EINVAL;
7a9ddfbf
PX
4139 break;
4140 }
4141
4142 /* Detect for any possible file errors */
4143 if (!ret && qemu_file_get_error(f)) {
4144 ret = qemu_file_get_error(f);
a7180877
DDAG
4145 }
4146
7a9ddfbf 4147 if (!ret && place_needed) {
a7180877 4148 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
4149 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4150
a7180877 4151 if (all_zero) {
df9ff5e1 4152 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 4153 block);
a7180877 4154 } else {
df9ff5e1 4155 ret = postcopy_place_page(mis, place_dest,
8be4620b 4156 place_source, block);
a7180877
DDAG
4157 }
4158 }
a7180877
DDAG
4159 }
4160
4161 return ret;
4162}
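/*
 * A worked example of the host-page assembly above, assuming a
 * hugetlbfs-backed block with 2 MiB host pages and 4 KiB target pages:
 * the 512 consecutive target pages of one host page are accumulated in
 * mis->postcopy_tmp_page; only when the last of them arrives
 * (place_needed) is the whole 2 MiB page placed atomically with
 * postcopy_place_page(), or with postcopy_place_page_zero() if every
 * target page in it was the zero page.
 */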
4163
acab30b8
DHB
4164static bool postcopy_is_advised(void)
4165{
4166 PostcopyState ps = postcopy_state_get();
4167 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4168}
4169
4170static bool postcopy_is_running(void)
4171{
4172 PostcopyState ps = postcopy_state_get();
4173 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4174}
4175
e6f4aa18
ZC
4176/*
4177 * Flush the content of the RAM cache into the SVM's memory.
4178 * Only flush the pages that were dirtied by the PVM or SVM or both.
4179 */
4180static void colo_flush_ram_cache(void)
4181{
4182 RAMBlock *block = NULL;
4183 void *dst_host;
4184 void *src_host;
4185 unsigned long offset = 0;
4186
d1955d22 4187 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4188 WITH_RCU_READ_LOCK_GUARD() {
4189 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4190 ramblock_sync_dirty_bitmap(ram_state, block);
4191 }
d1955d22 4192 }
d1955d22 4193
e6f4aa18 4194 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4195 WITH_RCU_READ_LOCK_GUARD() {
4196 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4197
89ac5a1d
DDAG
4198 while (block) {
4199 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 4200
89ac5a1d
DDAG
4201 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4202 offset = 0;
4203 block = QLIST_NEXT_RCU(block, next);
4204 } else {
4205 migration_bitmap_clear_dirty(ram_state, block, offset);
4206 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4207 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4208 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4209 }
e6f4aa18
ZC
4210 }
4211 }
e6f4aa18
ZC
4212 trace_colo_flush_ram_cache_end();
4213}
4214
10da4a36
WY
4215/**
4216 * ram_load_precopy: load pages in precopy case
4217 *
4218 * Returns 0 for success or -errno in case of error
4219 *
4220 * Called in precopy mode by ram_load().
4221 * rcu_read_lock is taken prior to this being called.
4222 *
4224 * @f: QEMUFile to receive the data from
4224 */
4225static int ram_load_precopy(QEMUFile *f)
56e93d26 4226{
10da4a36 4227 int flags = 0, ret = 0, invalid_flags = 0, len = 0;
ef08fb38 4228 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4229 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
4230 if (!migrate_use_compression()) {
4231 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4232 }
a7180877 4233
10da4a36 4234 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4235 ram_addr_t addr, total_ram_bytes;
a776aa15 4236 void *host = NULL;
56e93d26
JQ
4237 uint8_t ch;
4238
4239 addr = qemu_get_be64(f);
4240 flags = addr & ~TARGET_PAGE_MASK;
4241 addr &= TARGET_PAGE_MASK;
4242
edc60127
JQ
4243 if (flags & invalid_flags) {
4244 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4245 error_report("Received an unexpected compressed page");
4246 }
4247
4248 ret = -EINVAL;
4249 break;
4250 }
4251
bb890ed5 4252 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4253 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
4254 RAMBlock *block = ram_block_from_stream(f, flags);
4255
13af18f2
ZC
4256 /*
4257 * After going into COLO, we should load the page into the colo_cache.
4258 */
4259 if (migration_incoming_in_colo_state()) {
4260 host = colo_cache_from_block_offset(block, addr);
4261 } else {
4262 host = host_from_ram_block_offset(block, addr);
4263 }
a776aa15
DDAG
4264 if (!host) {
4265 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4266 ret = -EINVAL;
4267 break;
4268 }
13af18f2
ZC
4269
4270 if (!migration_incoming_in_colo_state()) {
4271 ramblock_recv_bitmap_set(block, host);
4272 }
4273
1db9d8e5 4274 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4275 }
4276
56e93d26
JQ
4277 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4278 case RAM_SAVE_FLAG_MEM_SIZE:
4279 /* Synchronize RAM block list */
4280 total_ram_bytes = addr;
4281 while (!ret && total_ram_bytes) {
4282 RAMBlock *block;
56e93d26
JQ
4283 char id[256];
4284 ram_addr_t length;
4285
4286 len = qemu_get_byte(f);
4287 qemu_get_buffer(f, (uint8_t *)id, len);
4288 id[len] = 0;
4289 length = qemu_get_be64(f);
4290
e3dd7493 4291 block = qemu_ram_block_by_name(id);
b895de50
CLG
4292 if (block && !qemu_ram_is_migratable(block)) {
4293 error_report("block %s should not be migrated !", id);
4294 ret = -EINVAL;
4295 } else if (block) {
e3dd7493
DDAG
4296 if (length != block->used_length) {
4297 Error *local_err = NULL;
56e93d26 4298
fa53a0e5 4299 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4300 &local_err);
4301 if (local_err) {
4302 error_report_err(local_err);
56e93d26 4303 }
56e93d26 4304 }
ef08fb38
DDAG
4305 /* For postcopy we need to check hugepage sizes match */
4306 if (postcopy_advised &&
4307 block->page_size != qemu_host_page_size) {
4308 uint64_t remote_page_size = qemu_get_be64(f);
4309 if (remote_page_size != block->page_size) {
4310 error_report("Mismatched RAM page size %s "
4311 "(local) %zd != %" PRId64,
4312 id, block->page_size,
4313 remote_page_size);
4314 ret = -EINVAL;
4315 }
4316 }
fbd162e6
YK
4317 if (migrate_ignore_shared()) {
4318 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4319 if (ramblock_is_ignored(block) &&
4320 block->mr->addr != addr) {
4321 error_report("Mismatched GPAs for block %s "
4322 "%" PRId64 "!= %" PRId64,
4323 id, (uint64_t)addr,
4324 (uint64_t)block->mr->addr);
4325 ret = -EINVAL;
4326 }
4327 }
e3dd7493
DDAG
4328 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4329 block->idstr);
4330 } else {
56e93d26
JQ
4331 error_report("Unknown ramblock \"%s\", cannot "
4332 "accept migration", id);
4333 ret = -EINVAL;
4334 }
4335
4336 total_ram_bytes -= length;
4337 }
4338 break;
a776aa15 4339
bb890ed5 4340 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4341 ch = qemu_get_byte(f);
4342 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4343 break;
a776aa15 4344
56e93d26 4345 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4346 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4347 break;
56e93d26 4348
a776aa15 4349 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4350 len = qemu_get_be32(f);
4351 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4352 error_report("Invalid compressed data length: %d", len);
4353 ret = -EINVAL;
4354 break;
4355 }
c1bc6626 4356 decompress_data_with_multi_threads(f, host, len);
56e93d26 4357 break;
a776aa15 4358
56e93d26 4359 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4360 if (load_xbzrle(f, addr, host) < 0) {
4361 error_report("Failed to decompress XBZRLE page at "
4362 RAM_ADDR_FMT, addr);
4363 ret = -EINVAL;
4364 break;
4365 }
4366 break;
4367 case RAM_SAVE_FLAG_EOS:
4368 /* normal exit */
6df264ac 4369 multifd_recv_sync_main();
56e93d26
JQ
4370 break;
4371 default:
4372 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4373 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4374 } else {
4375 error_report("Unknown combination of migration flags: %#x",
4376 flags);
4377 ret = -EINVAL;
4378 }
4379 }
4380 if (!ret) {
4381 ret = qemu_file_get_error(f);
4382 }
4383 }
4384
10da4a36
WY
4385 return ret;
4386}
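/*
 * For reference, each page record consumed by the loop above starts
 * with a be64 of (addr | flags) and continues, depending on the flag,
 * with:
 *
 *   RAM_SAVE_FLAG_ZERO          : 1 byte fill value
 *   RAM_SAVE_FLAG_PAGE          : TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE : be32 length + that many zlib bytes
 *   RAM_SAVE_FLAG_XBZRLE        : the XBZRLE header and delta
 *                                 (see load_xbzrle())
 *   RAM_SAVE_FLAG_MEM_SIZE      : the RAMBlock list written by
 *                                 ram_save_setup()
 *
 * For the page-carrying flags the record also begins with the block id
 * read by ram_block_from_stream(), unless RAM_SAVE_FLAG_CONTINUE is set.
 */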
4387
4388static int ram_load(QEMUFile *f, void *opaque, int version_id)
4389{
4390 int ret = 0;
4391 static uint64_t seq_iter;
4392 /*
4393 * If the system is running in postcopy mode, page inserts into host
4394 * memory must be atomic
4395 */
4396 bool postcopy_running = postcopy_is_running();
4397
4398 seq_iter++;
4399
4400 if (version_id != 4) {
4401 return -EINVAL;
4402 }
4403
4404 /*
4405 * This RCU critical section can be very long running.
4406 * When RCU reclaims in the code start to become numerous,
4407 * it will be necessary to reduce the granularity of this
4408 * critical section.
4409 */
89ac5a1d
DDAG
4410 WITH_RCU_READ_LOCK_GUARD() {
4411 if (postcopy_running) {
4412 ret = ram_load_postcopy(f);
4413 } else {
4414 ret = ram_load_precopy(f);
4415 }
10da4a36 4416
89ac5a1d 4417 ret |= wait_for_decompress_done();
10da4a36 4418 }
55c4446b 4419 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4420
4421 if (!ret && migration_incoming_in_colo_state()) {
4422 colo_flush_ram_cache();
4423 }
56e93d26
JQ
4424 return ret;
4425}
4426
c6467627
VSO
4427static bool ram_has_postcopy(void *opaque)
4428{
469dd51b 4429 RAMBlock *rb;
fbd162e6 4430 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4431 if (ramblock_is_pmem(rb)) {
4432 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4433 "is not supported now!", rb->idstr, rb->host);
4434 return false;
4435 }
4436 }
4437
c6467627
VSO
4438 return migrate_postcopy_ram();
4439}
4440
edd090c7
PX
4441/* Sync all the dirty bitmap with destination VM. */
4442static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4443{
4444 RAMBlock *block;
4445 QEMUFile *file = s->to_dst_file;
4446 int ramblock_count = 0;
4447
4448 trace_ram_dirty_bitmap_sync_start();
4449
fbd162e6 4450 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4451 qemu_savevm_send_recv_bitmap(file, block->idstr);
4452 trace_ram_dirty_bitmap_request(block->idstr);
4453 ramblock_count++;
4454 }
4455
4456 trace_ram_dirty_bitmap_sync_wait();
4457
4458 /* Wait until all the ramblocks' dirty bitmap synced */
4459 while (ramblock_count--) {
4460 qemu_sem_wait(&s->rp_state.rp_sem);
4461 }
4462
4463 trace_ram_dirty_bitmap_sync_complete();
4464
4465 return 0;
4466}
4467
4468static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4469{
4470 qemu_sem_post(&s->rp_state.rp_sem);
4471}
4472
a335debb
PX
4473/*
4474 * Read the received bitmap, revert it as the initial dirty bitmap.
4475 * This is only used when the postcopy migration is paused but wants
4476 * to resume from a middle point.
4477 */
4478int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4479{
4480 int ret = -EINVAL;
4481 QEMUFile *file = s->rp_state.from_dst_file;
4482 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4483 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4484 uint64_t size, end_mark;
4485
4486 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4487
4488 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4489 error_report("%s: incorrect state %s", __func__,
4490 MigrationStatus_str(s->state));
4491 return -EINVAL;
4492 }
4493
4494 /*
4495 * Note: see comments in ramblock_recv_bitmap_send() on why we
4496 * need the endianness conversion, and the padding.
4497 */
4498 local_size = ROUND_UP(local_size, 8);
4499
4500 /* Add paddings */
4501 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4502
4503 size = qemu_get_be64(file);
4504
4505 /* The size of the bitmap should match with our ramblock */
4506 if (size != local_size) {
4507 error_report("%s: ramblock '%s' bitmap size mismatch "
4508 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4509 block->idstr, size, local_size);
4510 ret = -EINVAL;
4511 goto out;
4512 }
4513
4514 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4515 end_mark = qemu_get_be64(file);
4516
4517 ret = qemu_file_get_error(file);
4518 if (ret || size != local_size) {
4519 error_report("%s: read bitmap failed for ramblock '%s': %d"
4520 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4521 __func__, block->idstr, ret, local_size, size);
4522 ret = -EIO;
4523 goto out;
4524 }
4525
4526 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4527 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4528 __func__, block->idstr, end_mark);
4529 ret = -EINVAL;
4530 goto out;
4531 }
4532
4533 /*
4534 * Endianness conversion. We are in postcopy (though paused).
4535 * The dirty bitmap won't change. We can directly modify it.
4536 */
4537 bitmap_from_le(block->bmap, le_bitmap, nbits);
4538
4539 /*
4540 * What we received is the "received bitmap". Invert it to form the initial
4541 * dirty bitmap for this ramblock.
4542 */
4543 bitmap_complement(block->bmap, block->bmap, nbits);
4544
4545 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4546
edd090c7
PX
4547 /*
4548 * We succeeded in syncing the bitmap for the current ramblock. If this is
4549 * the last one to sync, we need to notify the main send thread.
4550 */
4551 ram_dirty_bitmap_reload_notify(s);
4552
a335debb
PX
4553 ret = 0;
4554out:
bf269906 4555 g_free(le_bitmap);
a335debb
PX
4556 return ret;
4557}
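/*
 * A rough sketch of the message parsed above (see
 * ramblock_recv_bitmap_send() for the sending side on the destination):
 *
 *   be64 : size of the bitmap in bytes, i.e. nbits/8 rounded up to a
 *          multiple of 8
 *   bytes: the received bitmap, little-endian long words
 *   be64 : RAMBLOCK_RECV_BITMAP_ENDING
 *
 * Since a "received" page is by definition clean, the bitmap is
 * complemented after the endianness conversion to become the initial
 * dirty bitmap for the resumed postcopy migration.
 */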
4558
edd090c7
PX
4559static int ram_resume_prepare(MigrationState *s, void *opaque)
4560{
4561 RAMState *rs = *(RAMState **)opaque;
08614f34 4562 int ret;
edd090c7 4563
08614f34
PX
4564 ret = ram_dirty_bitmap_sync_all(s, rs);
4565 if (ret) {
4566 return ret;
4567 }
4568
4569 ram_state_resume_prepare(rs, s->to_dst_file);
4570
4571 return 0;
edd090c7
PX
4572}
4573
56e93d26 4574static SaveVMHandlers savevm_ram_handlers = {
9907e842 4575 .save_setup = ram_save_setup,
56e93d26 4576 .save_live_iterate = ram_save_iterate,
763c906b 4577 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4578 .save_live_complete_precopy = ram_save_complete,
c6467627 4579 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4580 .save_live_pending = ram_save_pending,
4581 .load_state = ram_load,
f265e0e4
JQ
4582 .save_cleanup = ram_save_cleanup,
4583 .load_setup = ram_load_setup,
4584 .load_cleanup = ram_load_cleanup,
edd090c7 4585 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4586};
4587
4588void ram_mig_init(void)
4589{
4590 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4591 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4592}
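/*
 * ram_mig_init() registers the "ram" savevm section (instance 0,
 * version 4, matching the version_id check in ram_load()) with the
 * handlers above, passing &ram_state as the opaque pointer the
 * save-side handlers dereference.
 */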