]> git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
COLO: Load dirty pages into SVM's RAM cache firstly
[mirror_qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
56eb90af 36#include "qemu/pmem.h"
709e3fe8 37#include "xbzrle.h"
7b1e1a22 38#include "ram.h"
6666c96a 39#include "migration.h"
71bb07db 40#include "socket.h"
f2a8f0a6 41#include "migration/register.h"
7b1e1a22 42#include "migration/misc.h"
08a0aee1 43#include "qemu-file.h"
be07b0ac 44#include "postcopy-ram.h"
53d37d36 45#include "page_cache.h"
56e93d26 46#include "qemu/error-report.h"
e688df6b 47#include "qapi/error.h"
9af23989 48#include "qapi/qapi-events-migration.h"
8acabf69 49#include "qapi/qmp/qerror.h"
56e93d26 50#include "trace.h"
56e93d26 51#include "exec/ram_addr.h"
f9494614 52#include "exec/target_page.h"
56e93d26 53#include "qemu/rcu_queue.h"
a91246c9 54#include "migration/colo.h"
53d37d36 55#include "block.h"
af8b7d2b
JQ
56#include "sysemu/sysemu.h"
57#include "qemu/uuid.h"
edd090c7 58#include "savevm.h"
b9ee2f7d 59#include "qemu/iov.h"
56e93d26 60
56e93d26
JQ
61/***********************************************************/
62/* ram save/restore */
63
bb890ed5
JQ
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
69
56e93d26 70#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 71#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
72#define RAM_SAVE_FLAG_MEM_SIZE 0x04
73#define RAM_SAVE_FLAG_PAGE 0x08
74#define RAM_SAVE_FLAG_EOS 0x10
75#define RAM_SAVE_FLAG_CONTINUE 0x20
76#define RAM_SAVE_FLAG_XBZRLE 0x40
77/* 0x80 is reserved in migration.h start with 0x100 next */
78#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
56e93d26
JQ
/*
 * is_zero_range: report whether a memory range contains only zero bytes
 *
 * Thin wrapper that forwards to buffer_is_zero().
 *
 * @p: start of the range
 * @size: length of the range in bytes
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
84
9360447d
JQ
85XBZRLECacheStats xbzrle_counters;
86
56e93d26
JQ
87/* struct contains XBZRLE cache and a static page
88 used by the compression */
89static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
c00e0928
JQ
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
f265e0e4
JQ
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
56e93d26
JQ
101} XBZRLE;
102
56e93d26
JQ
103static void XBZRLE_cache_lock(void)
104{
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107}
108
109static void XBZRLE_cache_unlock(void)
110{
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113}
114
3d0684b2
JQ
115/**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
c9dede2d 123 * Returns 0 for success or -1 for error
3d0684b2
JQ
124 *
125 * @new_size: new cache size
8acabf69 126 * @errp: set *errp if the check failed, with reason
56e93d26 127 */
c9dede2d 128int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
129{
130 PageCache *new_cache;
c9dede2d 131 int64_t ret = 0;
56e93d26 132
8acabf69
JQ
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
2a313e5c
JQ
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
c9dede2d 142 return 0;
2a313e5c
JQ
143 }
144
56e93d26
JQ
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
80f8dfde 148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 149 if (!new_cache) {
56e93d26
JQ
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
56e93d26
JQ
157out:
158 XBZRLE_cache_unlock();
159 return ret;
160}
161
b895de50
CLG
/*
 * Iterate over the RAM blocks for which qemu_ram_is_migratable() is
 * true.  Should be holding either ram_list.mutex, or the RCU lock.
 *
 * The trailing "if (...) {} else" makes the statement that follows the
 * macro the loop body, executed only for migratable blocks.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else
166
343f632c
DDAG
167#undef RAMBLOCK_FOREACH
168
f9494614
AP
169static void ramblock_recv_map_init(void)
170{
171 RAMBlock *rb;
172
b895de50 173 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
f9494614
AP
174 assert(!rb->receivedmap);
175 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
176 }
177}
178
179int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
180{
181 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
182 rb->receivedmap);
183}
184
1cba9f6e
DDAG
185bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
186{
187 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
188}
189
f9494614
AP
190void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
191{
192 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
193}
194
195void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
196 size_t nr)
197{
198 bitmap_set_atomic(rb->receivedmap,
199 ramblock_recv_bitmap_offset(host_addr, rb),
200 nr);
201}
202
a335debb
PX
203#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
204
205/*
206 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
207 *
208 * Returns >0 if success with sent bytes, or <0 if error.
209 */
210int64_t ramblock_recv_bitmap_send(QEMUFile *file,
211 const char *block_name)
212{
213 RAMBlock *block = qemu_ram_block_by_name(block_name);
214 unsigned long *le_bitmap, nbits;
215 uint64_t size;
216
217 if (!block) {
218 error_report("%s: invalid block name: %s", __func__, block_name);
219 return -1;
220 }
221
222 nbits = block->used_length >> TARGET_PAGE_BITS;
223
224 /*
225 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
226 * machines we may need 4 more bytes for padding (see below
227 * comment). So extend it a bit before hand.
228 */
229 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
230
231 /*
232 * Always use little endian when sending the bitmap. This is
233 * required that when source and destination VMs are not using the
234 * same endianess. (Note: big endian won't work.)
235 */
236 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
237
238 /* Size of the bitmap, in bytes */
a725ef9f 239 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
240
241 /*
242 * size is always aligned to 8 bytes for 64bit machines, but it
243 * may not be true for 32bit machines. We need this padding to
244 * make sure the migration can survive even between 32bit and
245 * 64bit machines.
246 */
247 size = ROUND_UP(size, 8);
248
249 qemu_put_be64(file, size);
250 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
251 /*
252 * Mark as an end, in case the middle part is screwed up due to
253 * some "misterious" reason.
254 */
255 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
256 qemu_fflush(file);
257
bf269906 258 g_free(le_bitmap);
a335debb
PX
259
260 if (qemu_file_get_error(file)) {
261 return qemu_file_get_error(file);
262 }
263
264 return size + sizeof(size);
265}
266
ec481c6c
JQ
267/*
268 * An outstanding page request, on the source, having been received
269 * and queued
270 */
271struct RAMSrcPageRequest {
272 RAMBlock *rb;
273 hwaddr offset;
274 hwaddr len;
275
276 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
277};
278
6f37bb8b
JQ
279/* State of RAM for migration */
280struct RAMState {
204b88b8
JQ
281 /* QEMUFile used for this migration */
282 QEMUFile *f;
6f37bb8b
JQ
283 /* Last block that we have visited searching for dirty pages */
284 RAMBlock *last_seen_block;
285 /* Last block from where we have sent data */
286 RAMBlock *last_sent_block;
269ace29
JQ
287 /* Last dirty target page we have sent */
288 ram_addr_t last_page;
6f37bb8b
JQ
289 /* last ram version we have seen */
290 uint32_t last_version;
291 /* We are in the first round */
292 bool ram_bulk_stage;
8d820d6f
JQ
293 /* How many times we have dirty too many pages */
294 int dirty_rate_high_cnt;
f664da80
JQ
295 /* these variables are used for bitmap sync */
296 /* last time we did a full bitmap_sync */
297 int64_t time_last_bitmap_sync;
eac74159 298 /* bytes transferred at start_time */
c4bdf0cf 299 uint64_t bytes_xfer_prev;
a66cd90c 300 /* number of dirty pages since start_time */
68908ed6 301 uint64_t num_dirty_pages_period;
b5833fde
JQ
302 /* xbzrle misses since the beginning of the period */
303 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
304
305 /* compression statistics since the beginning of the period */
306 /* amount of count that no free thread to compress data */
307 uint64_t compress_thread_busy_prev;
308 /* amount bytes after compression */
309 uint64_t compressed_size_prev;
310 /* amount of compressed pages */
311 uint64_t compress_pages_prev;
312
be8b02ed
XG
313 /* total handled target pages at the beginning of period */
314 uint64_t target_page_count_prev;
315 /* total handled target pages since start */
316 uint64_t target_page_count;
9360447d 317 /* number of dirty bits in the bitmap */
2dfaf12e
PX
318 uint64_t migration_dirty_pages;
319 /* protects modification of the bitmap */
108cfae0 320 QemuMutex bitmap_mutex;
68a098f3
JQ
321 /* The RAMBlock used in the last src_page_requests */
322 RAMBlock *last_req_rb;
ec481c6c
JQ
323 /* Queue of outstanding page requests from the destination */
324 QemuMutex src_page_req_mutex;
325 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
326};
327typedef struct RAMState RAMState;
328
53518d94 329static RAMState *ram_state;
6f37bb8b 330
9edabd4d 331uint64_t ram_bytes_remaining(void)
2f4fde93 332{
bae416e5
DDAG
333 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
334 0;
2f4fde93
JQ
335}
336
9360447d 337MigrationStats ram_counters;
96506894 338
b8fb8cb7
DDAG
339/* used by the search for pages to send */
340struct PageSearchStatus {
341 /* Current block being searched */
342 RAMBlock *block;
a935e30f
JQ
343 /* Current page to search from */
344 unsigned long page;
b8fb8cb7
DDAG
345 /* Set once we wrap around */
346 bool complete_round;
347};
348typedef struct PageSearchStatus PageSearchStatus;
349
76e03000
XG
350CompressionStats compression_counters;
351
56e93d26 352struct CompressParam {
56e93d26 353 bool done;
90e56fb4 354 bool quit;
5e5fdcff 355 bool zero_page;
56e93d26
JQ
356 QEMUFile *file;
357 QemuMutex mutex;
358 QemuCond cond;
359 RAMBlock *block;
360 ram_addr_t offset;
34ab9e97
XG
361
362 /* internally used fields */
dcaf446e 363 z_stream stream;
34ab9e97 364 uint8_t *originbuf;
56e93d26
JQ
365};
366typedef struct CompressParam CompressParam;
367
368struct DecompressParam {
73a8912b 369 bool done;
90e56fb4 370 bool quit;
56e93d26
JQ
371 QemuMutex mutex;
372 QemuCond cond;
373 void *des;
d341d9f3 374 uint8_t *compbuf;
56e93d26 375 int len;
797ca154 376 z_stream stream;
56e93d26
JQ
377};
378typedef struct DecompressParam DecompressParam;
379
380static CompressParam *comp_param;
381static QemuThread *compress_threads;
382/* comp_done_cond is used to wake up the migration thread when
383 * one of the compression threads has finished the compression.
384 * comp_done_lock is used to co-work with comp_done_cond.
385 */
0d9f9a5c
LL
386static QemuMutex comp_done_lock;
387static QemuCond comp_done_cond;
56e93d26
JQ
388/* The empty QEMUFileOps will be used by file in CompressParam */
389static const QEMUFileOps empty_ops = { };
390
34ab9e97 391static QEMUFile *decomp_file;
56e93d26
JQ
392static DecompressParam *decomp_param;
393static QemuThread *decompress_threads;
73a8912b
LL
394static QemuMutex decomp_done_lock;
395static QemuCond decomp_done_cond;
56e93d26 396
5e5fdcff 397static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 398 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
399
400static void *do_data_compress(void *opaque)
401{
402 CompressParam *param = opaque;
a7a9a88f
LL
403 RAMBlock *block;
404 ram_addr_t offset;
5e5fdcff 405 bool zero_page;
56e93d26 406
a7a9a88f 407 qemu_mutex_lock(&param->mutex);
90e56fb4 408 while (!param->quit) {
a7a9a88f
LL
409 if (param->block) {
410 block = param->block;
411 offset = param->offset;
412 param->block = NULL;
413 qemu_mutex_unlock(&param->mutex);
414
5e5fdcff
XG
415 zero_page = do_compress_ram_page(param->file, &param->stream,
416 block, offset, param->originbuf);
a7a9a88f 417
0d9f9a5c 418 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 419 param->done = true;
5e5fdcff 420 param->zero_page = zero_page;
0d9f9a5c
LL
421 qemu_cond_signal(&comp_done_cond);
422 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
423
424 qemu_mutex_lock(&param->mutex);
425 } else {
56e93d26
JQ
426 qemu_cond_wait(&param->cond, &param->mutex);
427 }
56e93d26 428 }
a7a9a88f 429 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
430
431 return NULL;
432}
433
f0afa331 434static void compress_threads_save_cleanup(void)
56e93d26
JQ
435{
436 int i, thread_count;
437
05306935 438 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
439 return;
440 }
05306935 441
56e93d26
JQ
442 thread_count = migrate_compress_threads();
443 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
444 /*
445 * we use it as a indicator which shows if the thread is
446 * properly init'd or not
447 */
448 if (!comp_param[i].file) {
449 break;
450 }
05306935
FL
451
452 qemu_mutex_lock(&comp_param[i].mutex);
453 comp_param[i].quit = true;
454 qemu_cond_signal(&comp_param[i].cond);
455 qemu_mutex_unlock(&comp_param[i].mutex);
456
56e93d26 457 qemu_thread_join(compress_threads + i);
56e93d26
JQ
458 qemu_mutex_destroy(&comp_param[i].mutex);
459 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 460 deflateEnd(&comp_param[i].stream);
34ab9e97 461 g_free(comp_param[i].originbuf);
dcaf446e
XG
462 qemu_fclose(comp_param[i].file);
463 comp_param[i].file = NULL;
56e93d26 464 }
0d9f9a5c
LL
465 qemu_mutex_destroy(&comp_done_lock);
466 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
467 g_free(compress_threads);
468 g_free(comp_param);
56e93d26
JQ
469 compress_threads = NULL;
470 comp_param = NULL;
56e93d26
JQ
471}
472
dcaf446e 473static int compress_threads_save_setup(void)
56e93d26
JQ
474{
475 int i, thread_count;
476
477 if (!migrate_use_compression()) {
dcaf446e 478 return 0;
56e93d26 479 }
56e93d26
JQ
480 thread_count = migrate_compress_threads();
481 compress_threads = g_new0(QemuThread, thread_count);
482 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
483 qemu_cond_init(&comp_done_cond);
484 qemu_mutex_init(&comp_done_lock);
56e93d26 485 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
486 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
487 if (!comp_param[i].originbuf) {
488 goto exit;
489 }
490
dcaf446e
XG
491 if (deflateInit(&comp_param[i].stream,
492 migrate_compress_level()) != Z_OK) {
34ab9e97 493 g_free(comp_param[i].originbuf);
dcaf446e
XG
494 goto exit;
495 }
496
e110aa91
C
497 /* comp_param[i].file is just used as a dummy buffer to save data,
498 * set its ops to empty.
56e93d26
JQ
499 */
500 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
501 comp_param[i].done = true;
90e56fb4 502 comp_param[i].quit = false;
56e93d26
JQ
503 qemu_mutex_init(&comp_param[i].mutex);
504 qemu_cond_init(&comp_param[i].cond);
505 qemu_thread_create(compress_threads + i, "compress",
506 do_data_compress, comp_param + i,
507 QEMU_THREAD_JOINABLE);
508 }
dcaf446e
XG
509 return 0;
510
511exit:
512 compress_threads_save_cleanup();
513 return -1;
56e93d26
JQ
514}
515
f986c3d2
JQ
516/* Multiple fd's */
517
af8b7d2b
JQ
518#define MULTIFD_MAGIC 0x11223344U
519#define MULTIFD_VERSION 1
520
6df264ac
JQ
521#define MULTIFD_FLAG_SYNC (1 << 0)
522
af8b7d2b
JQ
/*
 * First message sent on every multifd channel.  magic and version
 * travel in big endian (see multifd_send_initial_packet()).
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
529
2a26c979
JQ
/*
 * Header that precedes each batch of pages on a multifd channel.  The
 * integer fields travel in big endian (see multifd_send_fill_packet()).
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* page count the sender is configured with */
    uint32_t size;
    /* number of entries of @offset that are in use */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAM block the pages belong to */
    char ramblock[256];
    /* offset of each page inside the RAM block */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
540
34c55a94
JQ
541typedef struct {
542 /* number of used pages */
543 uint32_t used;
544 /* number of allocated pages */
545 uint32_t allocated;
546 /* global number of generated multifd packets */
547 uint64_t packet_num;
548 /* offset of each page */
549 ram_addr_t *offset;
550 /* pointer to each page */
551 struct iovec *iov;
552 RAMBlock *block;
553} MultiFDPages_t;
554
8c4598f2
JQ
555typedef struct {
556 /* this fields are not changed once the thread is created */
557 /* channel number */
f986c3d2 558 uint8_t id;
8c4598f2 559 /* channel thread name */
f986c3d2 560 char *name;
8c4598f2 561 /* channel thread id */
f986c3d2 562 QemuThread thread;
8c4598f2 563 /* communication channel */
60df2d4a 564 QIOChannel *c;
8c4598f2 565 /* sem where to wait for more work */
f986c3d2 566 QemuSemaphore sem;
8c4598f2 567 /* this mutex protects the following parameters */
f986c3d2 568 QemuMutex mutex;
8c4598f2 569 /* is this channel thread running */
66770707 570 bool running;
8c4598f2 571 /* should this thread finish */
f986c3d2 572 bool quit;
0beb5ed3
JQ
573 /* thread has work to do */
574 int pending_job;
34c55a94
JQ
575 /* array of pages to sent */
576 MultiFDPages_t *pages;
2a26c979
JQ
577 /* packet allocated len */
578 uint32_t packet_len;
579 /* pointer to the packet */
580 MultiFDPacket_t *packet;
581 /* multifd flags for each packet */
582 uint32_t flags;
583 /* global number of generated multifd packets */
584 uint64_t packet_num;
408ea6ae
JQ
585 /* thread local variables */
586 /* packets sent through this channel */
587 uint64_t num_packets;
588 /* pages sent through this channel */
589 uint64_t num_pages;
6df264ac
JQ
590 /* syncs main thread and channels */
591 QemuSemaphore sem_sync;
8c4598f2
JQ
592} MultiFDSendParams;
593
594typedef struct {
595 /* this fields are not changed once the thread is created */
596 /* channel number */
597 uint8_t id;
598 /* channel thread name */
599 char *name;
600 /* channel thread id */
601 QemuThread thread;
602 /* communication channel */
603 QIOChannel *c;
8c4598f2
JQ
604 /* this mutex protects the following parameters */
605 QemuMutex mutex;
606 /* is this channel thread running */
607 bool running;
34c55a94
JQ
608 /* array of pages to receive */
609 MultiFDPages_t *pages;
2a26c979
JQ
610 /* packet allocated len */
611 uint32_t packet_len;
612 /* pointer to the packet */
613 MultiFDPacket_t *packet;
614 /* multifd flags for each packet */
615 uint32_t flags;
616 /* global number of generated multifd packets */
617 uint64_t packet_num;
408ea6ae
JQ
618 /* thread local variables */
619 /* packets sent through this channel */
620 uint64_t num_packets;
621 /* pages sent through this channel */
622 uint64_t num_pages;
6df264ac
JQ
623 /* syncs main thread and channels */
624 QemuSemaphore sem_sync;
8c4598f2 625} MultiFDRecvParams;
f986c3d2 626
af8b7d2b
JQ
627static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
628{
629 MultiFDInit_t msg;
630 int ret;
631
632 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
633 msg.version = cpu_to_be32(MULTIFD_VERSION);
634 msg.id = p->id;
635 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
636
637 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
638 if (ret != 0) {
639 return -1;
640 }
641 return 0;
642}
643
644static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
645{
646 MultiFDInit_t msg;
647 int ret;
648
649 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
650 if (ret != 0) {
651 return -1;
652 }
653
341ba0df
PM
654 msg.magic = be32_to_cpu(msg.magic);
655 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
656
657 if (msg.magic != MULTIFD_MAGIC) {
658 error_setg(errp, "multifd: received packet magic %x "
659 "expected %x", msg.magic, MULTIFD_MAGIC);
660 return -1;
661 }
662
663 if (msg.version != MULTIFD_VERSION) {
664 error_setg(errp, "multifd: received packet version %d "
665 "expected %d", msg.version, MULTIFD_VERSION);
666 return -1;
667 }
668
669 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
670 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
671 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
672
673 error_setg(errp, "multifd: received uuid '%s' and expected "
674 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
675 g_free(uuid);
676 g_free(msg_uuid);
677 return -1;
678 }
679
680 if (msg.id > migrate_multifd_channels()) {
681 error_setg(errp, "multifd: received channel version %d "
682 "expected %d", msg.version, MULTIFD_VERSION);
683 return -1;
684 }
685
686 return msg.id;
687}
688
34c55a94
JQ
689static MultiFDPages_t *multifd_pages_init(size_t size)
690{
691 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
692
693 pages->allocated = size;
694 pages->iov = g_new0(struct iovec, size);
695 pages->offset = g_new0(ram_addr_t, size);
696
697 return pages;
698}
699
700static void multifd_pages_clear(MultiFDPages_t *pages)
701{
702 pages->used = 0;
703 pages->allocated = 0;
704 pages->packet_num = 0;
705 pages->block = NULL;
706 g_free(pages->iov);
707 pages->iov = NULL;
708 g_free(pages->offset);
709 pages->offset = NULL;
710 g_free(pages);
711}
712
2a26c979
JQ
713static void multifd_send_fill_packet(MultiFDSendParams *p)
714{
715 MultiFDPacket_t *packet = p->packet;
716 int i;
717
718 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
719 packet->version = cpu_to_be32(MULTIFD_VERSION);
720 packet->flags = cpu_to_be32(p->flags);
721 packet->size = cpu_to_be32(migrate_multifd_page_count());
722 packet->used = cpu_to_be32(p->pages->used);
723 packet->packet_num = cpu_to_be64(p->packet_num);
724
725 if (p->pages->block) {
726 strncpy(packet->ramblock, p->pages->block->idstr, 256);
727 }
728
729 for (i = 0; i < p->pages->used; i++) {
730 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
731 }
732}
733
734static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
735{
736 MultiFDPacket_t *packet = p->packet;
737 RAMBlock *block;
738 int i;
739
341ba0df 740 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
741 if (packet->magic != MULTIFD_MAGIC) {
742 error_setg(errp, "multifd: received packet "
743 "magic %x and expected magic %x",
744 packet->magic, MULTIFD_MAGIC);
745 return -1;
746 }
747
341ba0df 748 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
749 if (packet->version != MULTIFD_VERSION) {
750 error_setg(errp, "multifd: received packet "
751 "version %d and expected version %d",
752 packet->version, MULTIFD_VERSION);
753 return -1;
754 }
755
756 p->flags = be32_to_cpu(packet->flags);
757
341ba0df 758 packet->size = be32_to_cpu(packet->size);
2a26c979
JQ
759 if (packet->size > migrate_multifd_page_count()) {
760 error_setg(errp, "multifd: received packet "
761 "with size %d and expected maximum size %d",
762 packet->size, migrate_multifd_page_count()) ;
763 return -1;
764 }
765
766 p->pages->used = be32_to_cpu(packet->used);
767 if (p->pages->used > packet->size) {
768 error_setg(errp, "multifd: received packet "
769 "with size %d and expected maximum size %d",
770 p->pages->used, packet->size) ;
771 return -1;
772 }
773
774 p->packet_num = be64_to_cpu(packet->packet_num);
775
776 if (p->pages->used) {
777 /* make sure that ramblock is 0 terminated */
778 packet->ramblock[255] = 0;
779 block = qemu_ram_block_by_name(packet->ramblock);
780 if (!block) {
781 error_setg(errp, "multifd: unknown ram block %s",
782 packet->ramblock);
783 return -1;
784 }
785 }
786
787 for (i = 0; i < p->pages->used; i++) {
788 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
789
790 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
791 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
792 " (max " RAM_ADDR_FMT ")",
793 offset, block->max_length);
794 return -1;
795 }
796 p->pages->iov[i].iov_base = block->host + offset;
797 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
798 }
799
800 return 0;
801}
802
f986c3d2
JQ
803struct {
804 MultiFDSendParams *params;
805 /* number of created threads */
806 int count;
34c55a94
JQ
807 /* array of pages to sent */
808 MultiFDPages_t *pages;
6df264ac
JQ
809 /* syncs main thread and channels */
810 QemuSemaphore sem_sync;
811 /* global number of generated multifd packets */
812 uint64_t packet_num;
b9ee2f7d
JQ
813 /* send channels ready */
814 QemuSemaphore channels_ready;
f986c3d2
JQ
815} *multifd_send_state;
816
b9ee2f7d
JQ
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we swap the one held by
 * multifd_send_state with the one held by the channel that is going to
 * send it.  There are two reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is holding the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
834
835static void multifd_send_pages(void)
836{
837 int i;
838 static int next_channel;
839 MultiFDSendParams *p = NULL; /* make happy gcc */
840 MultiFDPages_t *pages = multifd_send_state->pages;
841 uint64_t transferred;
842
843 qemu_sem_wait(&multifd_send_state->channels_ready);
844 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
845 p = &multifd_send_state->params[i];
846
847 qemu_mutex_lock(&p->mutex);
848 if (!p->pending_job) {
849 p->pending_job++;
850 next_channel = (i + 1) % migrate_multifd_channels();
851 break;
852 }
853 qemu_mutex_unlock(&p->mutex);
854 }
855 p->pages->used = 0;
856
857 p->packet_num = multifd_send_state->packet_num++;
858 p->pages->block = NULL;
859 multifd_send_state->pages = p->pages;
860 p->pages = pages;
4fcefd44 861 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
b9ee2f7d
JQ
862 ram_counters.multifd_bytes += transferred;
863 ram_counters.transferred += transferred;;
864 qemu_mutex_unlock(&p->mutex);
865 qemu_sem_post(&p->sem);
866}
867
868static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
869{
870 MultiFDPages_t *pages = multifd_send_state->pages;
871
872 if (!pages->block) {
873 pages->block = block;
874 }
875
876 if (pages->block == block) {
877 pages->offset[pages->used] = offset;
878 pages->iov[pages->used].iov_base = block->host + offset;
879 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
880 pages->used++;
881
882 if (pages->used < pages->allocated) {
883 return;
884 }
885 }
886
887 multifd_send_pages();
888
889 if (pages->block != block) {
890 multifd_queue_page(block, offset);
891 }
892}
893
66770707 894static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
895{
896 int i;
897
7a169d74
JQ
898 if (err) {
899 MigrationState *s = migrate_get_current();
900 migrate_set_error(s, err);
901 if (s->state == MIGRATION_STATUS_SETUP ||
902 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
903 s->state == MIGRATION_STATUS_DEVICE ||
904 s->state == MIGRATION_STATUS_ACTIVE) {
905 migrate_set_state(&s->state, s->state,
906 MIGRATION_STATUS_FAILED);
907 }
908 }
909
66770707 910 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
911 MultiFDSendParams *p = &multifd_send_state->params[i];
912
913 qemu_mutex_lock(&p->mutex);
914 p->quit = true;
915 qemu_sem_post(&p->sem);
916 qemu_mutex_unlock(&p->mutex);
917 }
918}
919
920int multifd_save_cleanup(Error **errp)
921{
922 int i;
923 int ret = 0;
924
925 if (!migrate_use_multifd()) {
926 return 0;
927 }
66770707
JQ
928 multifd_send_terminate_threads(NULL);
929 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
930 MultiFDSendParams *p = &multifd_send_state->params[i];
931
66770707
JQ
932 if (p->running) {
933 qemu_thread_join(&p->thread);
934 }
60df2d4a
JQ
935 socket_send_channel_destroy(p->c);
936 p->c = NULL;
f986c3d2
JQ
937 qemu_mutex_destroy(&p->mutex);
938 qemu_sem_destroy(&p->sem);
6df264ac 939 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
940 g_free(p->name);
941 p->name = NULL;
34c55a94
JQ
942 multifd_pages_clear(p->pages);
943 p->pages = NULL;
2a26c979
JQ
944 p->packet_len = 0;
945 g_free(p->packet);
946 p->packet = NULL;
f986c3d2 947 }
b9ee2f7d 948 qemu_sem_destroy(&multifd_send_state->channels_ready);
6df264ac 949 qemu_sem_destroy(&multifd_send_state->sem_sync);
f986c3d2
JQ
950 g_free(multifd_send_state->params);
951 multifd_send_state->params = NULL;
34c55a94
JQ
952 multifd_pages_clear(multifd_send_state->pages);
953 multifd_send_state->pages = NULL;
f986c3d2
JQ
954 g_free(multifd_send_state);
955 multifd_send_state = NULL;
956 return ret;
957}
958
6df264ac
JQ
959static void multifd_send_sync_main(void)
960{
961 int i;
962
963 if (!migrate_use_multifd()) {
964 return;
965 }
b9ee2f7d
JQ
966 if (multifd_send_state->pages->used) {
967 multifd_send_pages();
968 }
6df264ac
JQ
969 for (i = 0; i < migrate_multifd_channels(); i++) {
970 MultiFDSendParams *p = &multifd_send_state->params[i];
971
972 trace_multifd_send_sync_main_signal(p->id);
973
974 qemu_mutex_lock(&p->mutex);
b9ee2f7d
JQ
975
976 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
977 p->flags |= MULTIFD_FLAG_SYNC;
978 p->pending_job++;
979 qemu_mutex_unlock(&p->mutex);
980 qemu_sem_post(&p->sem);
981 }
982 for (i = 0; i < migrate_multifd_channels(); i++) {
983 MultiFDSendParams *p = &multifd_send_state->params[i];
984
985 trace_multifd_send_sync_main_wait(p->id);
986 qemu_sem_wait(&multifd_send_state->sem_sync);
987 }
988 trace_multifd_send_sync_main(multifd_send_state->packet_num);
989}
990
f986c3d2
JQ
991static void *multifd_send_thread(void *opaque)
992{
993 MultiFDSendParams *p = opaque;
af8b7d2b 994 Error *local_err = NULL;
8b2db7f5 995 int ret;
af8b7d2b 996
408ea6ae 997 trace_multifd_send_thread_start(p->id);
74637e6f 998 rcu_register_thread();
408ea6ae 999
af8b7d2b
JQ
1000 if (multifd_send_initial_packet(p, &local_err) < 0) {
1001 goto out;
1002 }
408ea6ae
JQ
1003 /* initial packet */
1004 p->num_packets = 1;
f986c3d2
JQ
1005
1006 while (true) {
d82628e4 1007 qemu_sem_wait(&p->sem);
f986c3d2 1008 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1009
1010 if (p->pending_job) {
1011 uint32_t used = p->pages->used;
1012 uint64_t packet_num = p->packet_num;
1013 uint32_t flags = p->flags;
1014
1015 multifd_send_fill_packet(p);
1016 p->flags = 0;
1017 p->num_packets++;
1018 p->num_pages += used;
1019 p->pages->used = 0;
1020 qemu_mutex_unlock(&p->mutex);
1021
1022 trace_multifd_send(p->id, packet_num, used, flags);
1023
8b2db7f5
JQ
1024 ret = qio_channel_write_all(p->c, (void *)p->packet,
1025 p->packet_len, &local_err);
1026 if (ret != 0) {
1027 break;
1028 }
1029
1030 ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1031 if (ret != 0) {
1032 break;
1033 }
0beb5ed3
JQ
1034
1035 qemu_mutex_lock(&p->mutex);
1036 p->pending_job--;
1037 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1038
1039 if (flags & MULTIFD_FLAG_SYNC) {
1040 qemu_sem_post(&multifd_send_state->sem_sync);
1041 }
b9ee2f7d 1042 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1043 } else if (p->quit) {
f986c3d2
JQ
1044 qemu_mutex_unlock(&p->mutex);
1045 break;
6df264ac
JQ
1046 } else {
1047 qemu_mutex_unlock(&p->mutex);
1048 /* sometimes there are spurious wakeups */
f986c3d2 1049 }
f986c3d2
JQ
1050 }
1051
af8b7d2b
JQ
1052out:
1053 if (local_err) {
1054 multifd_send_terminate_threads(local_err);
1055 }
1056
66770707
JQ
1057 qemu_mutex_lock(&p->mutex);
1058 p->running = false;
1059 qemu_mutex_unlock(&p->mutex);
1060
74637e6f 1061 rcu_unregister_thread();
408ea6ae
JQ
1062 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1063
f986c3d2
JQ
1064 return NULL;
1065}
1066
60df2d4a
JQ
1067static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1068{
1069 MultiFDSendParams *p = opaque;
1070 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1071 Error *local_err = NULL;
1072
1073 if (qio_task_propagate_error(task, &local_err)) {
1074 if (multifd_save_cleanup(&local_err) != 0) {
1075 migrate_set_error(migrate_get_current(), local_err);
1076 }
1077 } else {
1078 p->c = QIO_CHANNEL(sioc);
1079 qio_channel_set_delay(p->c, false);
1080 p->running = true;
1081 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1082 QEMU_THREAD_JOINABLE);
1083
1084 atomic_inc(&multifd_send_state->count);
1085 }
1086}
1087
f986c3d2
JQ
1088int multifd_save_setup(void)
1089{
1090 int thread_count;
34c55a94 1091 uint32_t page_count = migrate_multifd_page_count();
f986c3d2
JQ
1092 uint8_t i;
1093
1094 if (!migrate_use_multifd()) {
1095 return 0;
1096 }
1097 thread_count = migrate_multifd_channels();
1098 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1099 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
66770707 1100 atomic_set(&multifd_send_state->count, 0);
34c55a94 1101 multifd_send_state->pages = multifd_pages_init(page_count);
6df264ac 1102 qemu_sem_init(&multifd_send_state->sem_sync, 0);
b9ee2f7d 1103 qemu_sem_init(&multifd_send_state->channels_ready, 0);
34c55a94 1104
f986c3d2
JQ
1105 for (i = 0; i < thread_count; i++) {
1106 MultiFDSendParams *p = &multifd_send_state->params[i];
1107
1108 qemu_mutex_init(&p->mutex);
1109 qemu_sem_init(&p->sem, 0);
6df264ac 1110 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1111 p->quit = false;
0beb5ed3 1112 p->pending_job = 0;
f986c3d2 1113 p->id = i;
34c55a94 1114 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1115 p->packet_len = sizeof(MultiFDPacket_t)
1116 + sizeof(ram_addr_t) * page_count;
1117 p->packet = g_malloc0(p->packet_len);
f986c3d2 1118 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1119 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1120 }
1121 return 0;
1122}
1123
f986c3d2
JQ
1124struct {
1125 MultiFDRecvParams *params;
1126 /* number of created threads */
1127 int count;
6df264ac
JQ
1128 /* syncs main thread and channels */
1129 QemuSemaphore sem_sync;
1130 /* global number of generated multifd packets */
1131 uint64_t packet_num;
f986c3d2
JQ
1132} *multifd_recv_state;
1133
66770707 1134static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1135{
1136 int i;
1137
7a169d74
JQ
1138 if (err) {
1139 MigrationState *s = migrate_get_current();
1140 migrate_set_error(s, err);
1141 if (s->state == MIGRATION_STATUS_SETUP ||
1142 s->state == MIGRATION_STATUS_ACTIVE) {
1143 migrate_set_state(&s->state, s->state,
1144 MIGRATION_STATUS_FAILED);
1145 }
1146 }
1147
66770707 1148 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1149 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1150
1151 qemu_mutex_lock(&p->mutex);
7a5cc33c
JQ
1152 /* We could arrive here for two reasons:
1153 - normal quit, i.e. everything went fine, just finished
1154 - error quit: We close the channels so the channel threads
1155 finish the qio_channel_read_all_eof() */
1156 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
f986c3d2
JQ
1157 qemu_mutex_unlock(&p->mutex);
1158 }
1159}
1160
1161int multifd_load_cleanup(Error **errp)
1162{
1163 int i;
1164 int ret = 0;
1165
1166 if (!migrate_use_multifd()) {
1167 return 0;
1168 }
66770707
JQ
1169 multifd_recv_terminate_threads(NULL);
1170 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1171 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1172
66770707
JQ
1173 if (p->running) {
1174 qemu_thread_join(&p->thread);
1175 }
60df2d4a
JQ
1176 object_unref(OBJECT(p->c));
1177 p->c = NULL;
f986c3d2 1178 qemu_mutex_destroy(&p->mutex);
6df264ac 1179 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1180 g_free(p->name);
1181 p->name = NULL;
34c55a94
JQ
1182 multifd_pages_clear(p->pages);
1183 p->pages = NULL;
2a26c979
JQ
1184 p->packet_len = 0;
1185 g_free(p->packet);
1186 p->packet = NULL;
f986c3d2 1187 }
6df264ac 1188 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1189 g_free(multifd_recv_state->params);
1190 multifd_recv_state->params = NULL;
1191 g_free(multifd_recv_state);
1192 multifd_recv_state = NULL;
1193
1194 return ret;
1195}
1196
6df264ac
JQ
1197static void multifd_recv_sync_main(void)
1198{
1199 int i;
1200
1201 if (!migrate_use_multifd()) {
1202 return;
1203 }
1204 for (i = 0; i < migrate_multifd_channels(); i++) {
1205 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1206
6df264ac
JQ
1207 trace_multifd_recv_sync_main_wait(p->id);
1208 qemu_sem_wait(&multifd_recv_state->sem_sync);
1209 qemu_mutex_lock(&p->mutex);
1210 if (multifd_recv_state->packet_num < p->packet_num) {
1211 multifd_recv_state->packet_num = p->packet_num;
1212 }
1213 qemu_mutex_unlock(&p->mutex);
1214 }
1215 for (i = 0; i < migrate_multifd_channels(); i++) {
1216 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1217
1218 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1219 qemu_sem_post(&p->sem_sync);
1220 }
1221 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1222}
1223
f986c3d2
JQ
1224static void *multifd_recv_thread(void *opaque)
1225{
1226 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1227 Error *local_err = NULL;
1228 int ret;
f986c3d2 1229
408ea6ae 1230 trace_multifd_recv_thread_start(p->id);
74637e6f 1231 rcu_register_thread();
408ea6ae 1232
f986c3d2 1233 while (true) {
6df264ac
JQ
1234 uint32_t used;
1235 uint32_t flags;
0beb5ed3 1236
8b2db7f5
JQ
1237 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1238 p->packet_len, &local_err);
1239 if (ret == 0) { /* EOF */
1240 break;
1241 }
1242 if (ret == -1) { /* Error */
1243 break;
1244 }
2a26c979 1245
6df264ac
JQ
1246 qemu_mutex_lock(&p->mutex);
1247 ret = multifd_recv_unfill_packet(p, &local_err);
1248 if (ret) {
f986c3d2
JQ
1249 qemu_mutex_unlock(&p->mutex);
1250 break;
1251 }
6df264ac
JQ
1252
1253 used = p->pages->used;
1254 flags = p->flags;
1255 trace_multifd_recv(p->id, p->packet_num, used, flags);
6df264ac
JQ
1256 p->num_packets++;
1257 p->num_pages += used;
f986c3d2 1258 qemu_mutex_unlock(&p->mutex);
6df264ac 1259
8b2db7f5
JQ
1260 ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1261 if (ret != 0) {
1262 break;
1263 }
1264
6df264ac
JQ
1265 if (flags & MULTIFD_FLAG_SYNC) {
1266 qemu_sem_post(&multifd_recv_state->sem_sync);
1267 qemu_sem_wait(&p->sem_sync);
1268 }
f986c3d2
JQ
1269 }
1270
d82628e4
JQ
1271 if (local_err) {
1272 multifd_recv_terminate_threads(local_err);
1273 }
66770707
JQ
1274 qemu_mutex_lock(&p->mutex);
1275 p->running = false;
1276 qemu_mutex_unlock(&p->mutex);
1277
74637e6f 1278 rcu_unregister_thread();
408ea6ae
JQ
1279 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1280
f986c3d2
JQ
1281 return NULL;
1282}
1283
1284int multifd_load_setup(void)
1285{
1286 int thread_count;
34c55a94 1287 uint32_t page_count = migrate_multifd_page_count();
f986c3d2
JQ
1288 uint8_t i;
1289
1290 if (!migrate_use_multifd()) {
1291 return 0;
1292 }
1293 thread_count = migrate_multifd_channels();
1294 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1295 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1296 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1297 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1298
f986c3d2
JQ
1299 for (i = 0; i < thread_count; i++) {
1300 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1301
1302 qemu_mutex_init(&p->mutex);
6df264ac 1303 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1304 p->id = i;
34c55a94 1305 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1306 p->packet_len = sizeof(MultiFDPacket_t)
1307 + sizeof(ram_addr_t) * page_count;
1308 p->packet = g_malloc0(p->packet_len);
f986c3d2 1309 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1310 }
1311 return 0;
1312}
1313
62c1e0ca
JQ
1314bool multifd_recv_all_channels_created(void)
1315{
1316 int thread_count = migrate_multifd_channels();
1317
1318 if (!migrate_use_multifd()) {
1319 return true;
1320 }
1321
1322 return thread_count == atomic_read(&multifd_recv_state->count);
1323}
1324
81e62053
PX
1325/* Return true if multifd is ready for the migration, otherwise false */
1326bool multifd_recv_new_channel(QIOChannel *ioc)
71bb07db 1327{
60df2d4a 1328 MultiFDRecvParams *p;
af8b7d2b
JQ
1329 Error *local_err = NULL;
1330 int id;
60df2d4a 1331
af8b7d2b
JQ
1332 id = multifd_recv_initial_packet(ioc, &local_err);
1333 if (id < 0) {
1334 multifd_recv_terminate_threads(local_err);
81e62053 1335 return false;
af8b7d2b
JQ
1336 }
1337
1338 p = &multifd_recv_state->params[id];
1339 if (p->c != NULL) {
1340 error_setg(&local_err, "multifd: received id '%d' already setup'",
1341 id);
1342 multifd_recv_terminate_threads(local_err);
81e62053 1343 return false;
af8b7d2b 1344 }
60df2d4a
JQ
1345 p->c = ioc;
1346 object_ref(OBJECT(ioc));
408ea6ae
JQ
1347 /* initial packet */
1348 p->num_packets = 1;
60df2d4a
JQ
1349
1350 p->running = true;
1351 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1352 QEMU_THREAD_JOINABLE);
1353 atomic_inc(&multifd_recv_state->count);
81e62053 1354 return multifd_recv_state->count == migrate_multifd_channels();
71bb07db
JQ
1355}
1356
56e93d26 1357/**
3d0684b2 1358 * save_page_header: write page header to wire
56e93d26
JQ
1359 *
1360 * If this is the 1st block, it also writes the block identification
1361 *
3d0684b2 1362 * Returns the number of bytes written
56e93d26
JQ
1363 *
1364 * @f: QEMUFile where to send the data
1365 * @block: block that contains the page we want to send
1366 * @offset: offset inside the block for the page
1367 * in the lower bits, it contains flags
1368 */
2bf3aa85
JQ
1369static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1370 ram_addr_t offset)
56e93d26 1371{
9f5f380b 1372 size_t size, len;
56e93d26 1373
24795694
JQ
1374 if (block == rs->last_sent_block) {
1375 offset |= RAM_SAVE_FLAG_CONTINUE;
1376 }
2bf3aa85 1377 qemu_put_be64(f, offset);
56e93d26
JQ
1378 size = 8;
1379
1380 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1381 len = strlen(block->idstr);
2bf3aa85
JQ
1382 qemu_put_byte(f, len);
1383 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1384 size += 1 + len;
24795694 1385 rs->last_sent_block = block;
56e93d26
JQ
1386 }
1387 return size;
1388}
1389
3d0684b2
JQ
1390/**
1391 * mig_throttle_guest_down: throotle down the guest
1392 *
1393 * Reduce amount of guest cpu execution to hopefully slow down memory
1394 * writes. If guest dirty memory rate is reduced below the rate at
1395 * which we can transfer pages to the destination then we should be
1396 * able to complete migration. Some workloads dirty memory way too
1397 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
1398 */
1399static void mig_throttle_guest_down(void)
1400{
1401 MigrationState *s = migrate_get_current();
2594f56d
DB
1402 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1403 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
4cbc9c7f 1404 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1405
1406 /* We have not started throttling yet. Let's start it. */
1407 if (!cpu_throttle_active()) {
1408 cpu_throttle_set(pct_initial);
1409 } else {
1410 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1411 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1412 pct_max));
070afca2
JH
1413 }
1414}
1415
3d0684b2
JQ
1416/**
1417 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1418 *
6f37bb8b 1419 * @rs: current RAM state
3d0684b2
JQ
1420 * @current_addr: address for the zero page
1421 *
1422 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1423 * The important thing is that a stale (not-yet-0'd) page be replaced
1424 * by the new data.
1425 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1426 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1427 */
6f37bb8b 1428static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1429{
6f37bb8b 1430 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1431 return;
1432 }
1433
1434 /* We don't care if this fails to allocate a new cache page
1435 * as long as it updated an old one */
c00e0928 1436 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1437 ram_counters.dirty_sync_count);
56e93d26
JQ
1438}
1439
1440#define ENCODING_FLAG_XBZRLE 0x1
1441
1442/**
1443 * save_xbzrle_page: compress and send current page
1444 *
1445 * Returns: 1 means that we wrote the page
1446 * 0 means that page is identical to the one already sent
1447 * -1 means that xbzrle would be longer than normal
1448 *
5a987738 1449 * @rs: current RAM state
3d0684b2
JQ
1450 * @current_data: pointer to the address of the page contents
1451 * @current_addr: addr of the page
56e93d26
JQ
1452 * @block: block that contains the page we want to send
1453 * @offset: offset inside the block for the page
1454 * @last_stage: if we are at the completion stage
56e93d26 1455 */
204b88b8 1456static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1457 ram_addr_t current_addr, RAMBlock *block,
072c2511 1458 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1459{
1460 int encoded_len = 0, bytes_xbzrle;
1461 uint8_t *prev_cached_page;
1462
9360447d
JQ
1463 if (!cache_is_cached(XBZRLE.cache, current_addr,
1464 ram_counters.dirty_sync_count)) {
1465 xbzrle_counters.cache_miss++;
56e93d26
JQ
1466 if (!last_stage) {
1467 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1468 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1469 return -1;
1470 } else {
1471 /* update *current_data when the page has been
1472 inserted into cache */
1473 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1474 }
1475 }
1476 return -1;
1477 }
1478
1479 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1480
1481 /* save current buffer into memory */
1482 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1483
1484 /* XBZRLE encoding (if there is no overflow) */
1485 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1486 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1487 TARGET_PAGE_SIZE);
1488 if (encoded_len == 0) {
55c4446b 1489 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1490 return 0;
1491 } else if (encoded_len == -1) {
55c4446b 1492 trace_save_xbzrle_page_overflow();
9360447d 1493 xbzrle_counters.overflow++;
56e93d26
JQ
1494 /* update data in the cache */
1495 if (!last_stage) {
1496 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1497 *current_data = prev_cached_page;
1498 }
1499 return -1;
1500 }
1501
1502 /* we need to update the data in the cache, in order to get the same data */
1503 if (!last_stage) {
1504 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1505 }
1506
1507 /* Send XBZRLE based compressed page */
2bf3aa85 1508 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1509 offset | RAM_SAVE_FLAG_XBZRLE);
1510 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1511 qemu_put_be16(rs->f, encoded_len);
1512 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1513 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1514 xbzrle_counters.pages++;
1515 xbzrle_counters.bytes += bytes_xbzrle;
1516 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1517
1518 return 1;
1519}
1520
3d0684b2
JQ
1521/**
1522 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1523 *
3d0684b2
JQ
1524 * Called with rcu_read_lock() to protect migration_bitmap
1525 *
1526 * Returns the byte offset within memory region of the start of a dirty page
1527 *
6f37bb8b 1528 * @rs: current RAM state
3d0684b2 1529 * @rb: RAMBlock where to search for dirty pages
a935e30f 1530 * @start: page where we start the search
f3f491fc 1531 */
56e93d26 1532static inline
a935e30f 1533unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1534 unsigned long start)
56e93d26 1535{
6b6712ef
JQ
1536 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1537 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1538 unsigned long next;
1539
b895de50
CLG
1540 if (!qemu_ram_is_migratable(rb)) {
1541 return size;
1542 }
1543
6b6712ef
JQ
1544 if (rs->ram_bulk_stage && start > 0) {
1545 next = start + 1;
56e93d26 1546 } else {
6b6712ef 1547 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1548 }
1549
6b6712ef 1550 return next;
56e93d26
JQ
1551}
1552
06b10688 1553static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1554 RAMBlock *rb,
1555 unsigned long page)
a82d593b
DDAG
1556{
1557 bool ret;
a82d593b 1558
6b6712ef 1559 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1560
1561 if (ret) {
0d8ec885 1562 rs->migration_dirty_pages--;
a82d593b
DDAG
1563 }
1564 return ret;
1565}
1566
15440dd5
JQ
1567static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1568 ram_addr_t start, ram_addr_t length)
56e93d26 1569{
0d8ec885 1570 rs->migration_dirty_pages +=
6b6712ef 1571 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 1572 &rs->num_dirty_pages_period);
56e93d26
JQ
1573}
1574
3d0684b2
JQ
1575/**
1576 * ram_pagesize_summary: calculate all the pagesizes of a VM
1577 *
1578 * Returns a summary bitmap of the page sizes of all RAMBlocks
1579 *
1580 * For VMs with just normal pages this is equivalent to the host page
1581 * size. If it's got some huge pages then it's the OR of all the
1582 * different page sizes.
e8ca1db2
DDAG
1583 */
1584uint64_t ram_pagesize_summary(void)
1585{
1586 RAMBlock *block;
1587 uint64_t summary = 0;
1588
b895de50 1589 RAMBLOCK_FOREACH_MIGRATABLE(block) {
e8ca1db2
DDAG
1590 summary |= block->page_size;
1591 }
1592
1593 return summary;
1594}
1595
b734035b
XG
1596static void migration_update_rates(RAMState *rs, int64_t end_time)
1597{
be8b02ed 1598 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1599 double compressed_size;
b734035b
XG
1600
1601 /* calculate period counters */
1602 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1603 / (end_time - rs->time_last_bitmap_sync);
1604
be8b02ed 1605 if (!page_count) {
b734035b
XG
1606 return;
1607 }
1608
1609 if (migrate_use_xbzrle()) {
1610 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1611 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1612 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1613 }
76e03000
XG
1614
1615 if (migrate_use_compression()) {
1616 compression_counters.busy_rate = (double)(compression_counters.busy -
1617 rs->compress_thread_busy_prev) / page_count;
1618 rs->compress_thread_busy_prev = compression_counters.busy;
1619
1620 compressed_size = compression_counters.compressed_size -
1621 rs->compressed_size_prev;
1622 if (compressed_size) {
1623 double uncompressed_size = (compression_counters.pages -
1624 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1625
1626 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1627 compression_counters.compression_rate =
1628 uncompressed_size / compressed_size;
1629
1630 rs->compress_pages_prev = compression_counters.pages;
1631 rs->compressed_size_prev = compression_counters.compressed_size;
1632 }
1633 }
b734035b
XG
1634}
1635
8d820d6f 1636static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1637{
1638 RAMBlock *block;
56e93d26 1639 int64_t end_time;
c4bdf0cf 1640 uint64_t bytes_xfer_now;
56e93d26 1641
9360447d 1642 ram_counters.dirty_sync_count++;
56e93d26 1643
f664da80
JQ
1644 if (!rs->time_last_bitmap_sync) {
1645 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1646 }
1647
1648 trace_migration_bitmap_sync_start();
9c1f8f44 1649 memory_global_dirty_log_sync();
56e93d26 1650
108cfae0 1651 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 1652 rcu_read_lock();
b895de50 1653 RAMBLOCK_FOREACH_MIGRATABLE(block) {
15440dd5 1654 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26 1655 }
650af890 1656 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1657 rcu_read_unlock();
108cfae0 1658 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1659
a66cd90c 1660 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1661
56e93d26
JQ
1662 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1663
1664 /* more than 1 second = 1000 millisecons */
f664da80 1665 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1666 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1667
9ac78b61
PL
1668 /* During block migration the auto-converge logic incorrectly detects
1669 * that ram migration makes no progress. Avoid this by disabling the
1670 * throttling logic during the bulk phase of block migration. */
1671 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1672 /* The following detection logic can be refined later. For now:
1673 Check to see if the dirtied bytes is 50% more than the approx.
1674 amount of bytes that just got transferred since the last time we
070afca2
JH
1675 were in this routine. If that happens twice, start or increase
1676 throttling */
070afca2 1677
d693c6f1 1678 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1679 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1680 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1681 trace_migration_throttle();
8d820d6f 1682 rs->dirty_rate_high_cnt = 0;
070afca2 1683 mig_throttle_guest_down();
d693c6f1 1684 }
56e93d26 1685 }
070afca2 1686
b734035b
XG
1687 migration_update_rates(rs, end_time);
1688
be8b02ed 1689 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1690
1691 /* reset period counters */
f664da80 1692 rs->time_last_bitmap_sync = end_time;
a66cd90c 1693 rs->num_dirty_pages_period = 0;
d2a4d85a 1694 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1695 }
4addcd4f 1696 if (migrate_use_events()) {
3ab72385 1697 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1698 }
56e93d26
JQ
1699}
1700
6c97ec5f
XG
1701/**
1702 * save_zero_page_to_file: send the zero page to the file
1703 *
1704 * Returns the size of data written to the file, 0 means the page is not
1705 * a zero page
1706 *
1707 * @rs: current RAM state
1708 * @file: the file where the data is saved
1709 * @block: block that contains the page we want to send
1710 * @offset: offset inside the block for the page
1711 */
1712static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1713 RAMBlock *block, ram_addr_t offset)
1714{
1715 uint8_t *p = block->host + offset;
1716 int len = 0;
1717
1718 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1719 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1720 qemu_put_byte(file, 0);
1721 len += 1;
1722 }
1723 return len;
1724}
1725
56e93d26 1726/**
3d0684b2 1727 * save_zero_page: send the zero page to the stream
56e93d26 1728 *
3d0684b2 1729 * Returns the number of pages written.
56e93d26 1730 *
f7ccd61b 1731 * @rs: current RAM state
56e93d26
JQ
1732 * @block: block that contains the page we want to send
1733 * @offset: offset inside the block for the page
56e93d26 1734 */
7faccdc3 1735static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1736{
6c97ec5f 1737 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1738
6c97ec5f 1739 if (len) {
9360447d 1740 ram_counters.duplicate++;
6c97ec5f
XG
1741 ram_counters.transferred += len;
1742 return 1;
56e93d26 1743 }
6c97ec5f 1744 return -1;
56e93d26
JQ
1745}
1746
5727309d 1747static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1748{
5727309d 1749 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1750 return;
1751 }
1752
aaa2064c 1753 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1754}
1755
059ff0fb
XG
1756/*
1757 * @pages: the number of pages written by the control path,
1758 * < 0 - error
1759 * > 0 - number of pages written
1760 *
1761 * Return true if the pages has been saved, otherwise false is returned.
1762 */
1763static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1764 int *pages)
1765{
1766 uint64_t bytes_xmit = 0;
1767 int ret;
1768
1769 *pages = -1;
1770 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1771 &bytes_xmit);
1772 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1773 return false;
1774 }
1775
1776 if (bytes_xmit) {
1777 ram_counters.transferred += bytes_xmit;
1778 *pages = 1;
1779 }
1780
1781 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1782 return true;
1783 }
1784
1785 if (bytes_xmit > 0) {
1786 ram_counters.normal++;
1787 } else if (bytes_xmit == 0) {
1788 ram_counters.duplicate++;
1789 }
1790
1791 return true;
1792}
1793
65dacaa0
XG
1794/*
1795 * directly send the page to the stream
1796 *
1797 * Returns the number of pages written.
1798 *
1799 * @rs: current RAM state
1800 * @block: block that contains the page we want to send
1801 * @offset: offset inside the block for the page
1802 * @buf: the page to be sent
1803 * @async: send to page asyncly
1804 */
1805static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1806 uint8_t *buf, bool async)
1807{
1808 ram_counters.transferred += save_page_header(rs, rs->f, block,
1809 offset | RAM_SAVE_FLAG_PAGE);
1810 if (async) {
1811 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1812 migrate_release_ram() &
1813 migration_in_postcopy());
1814 } else {
1815 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1816 }
1817 ram_counters.transferred += TARGET_PAGE_SIZE;
1818 ram_counters.normal++;
1819 return 1;
1820}
1821
56e93d26 1822/**
3d0684b2 1823 * ram_save_page: send the given page to the stream
56e93d26 1824 *
3d0684b2 1825 * Returns the number of pages written.
3fd3c4b3
DDAG
1826 * < 0 - error
1827 * >=0 - Number of pages written - this might legally be 0
1828 * if xbzrle noticed the page was the same.
56e93d26 1829 *
6f37bb8b 1830 * @rs: current RAM state
56e93d26
JQ
1831 * @block: block that contains the page we want to send
1832 * @offset: offset inside the block for the page
1833 * @last_stage: if we are at the completion stage
56e93d26 1834 */
a0a8aa14 1835static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1836{
1837 int pages = -1;
56e93d26 1838 uint8_t *p;
56e93d26 1839 bool send_async = true;
a08f6890 1840 RAMBlock *block = pss->block;
a935e30f 1841 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 1842 ram_addr_t current_addr = block->offset + offset;
56e93d26 1843
2f68e399 1844 p = block->host + offset;
1db9d8e5 1845 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1846
56e93d26 1847 XBZRLE_cache_lock();
d7400a34
XG
1848 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1849 migrate_use_xbzrle()) {
059ff0fb
XG
1850 pages = save_xbzrle_page(rs, &p, current_addr, block,
1851 offset, last_stage);
1852 if (!last_stage) {
1853 /* Can't send this cached data async, since the cache page
1854 * might get updated before it gets to the wire
56e93d26 1855 */
059ff0fb 1856 send_async = false;
56e93d26
JQ
1857 }
1858 }
1859
1860 /* XBZRLE overflow or normal page */
1861 if (pages == -1) {
65dacaa0 1862 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1863 }
1864
1865 XBZRLE_cache_unlock();
1866
1867 return pages;
1868}
1869
b9ee2f7d
JQ
1870static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1871 ram_addr_t offset)
1872{
b9ee2f7d 1873 multifd_queue_page(block, offset);
b9ee2f7d
JQ
1874 ram_counters.normal++;
1875
1876 return 1;
1877}
1878
5e5fdcff 1879static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1880 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1881{
53518d94 1882 RAMState *rs = ram_state;
a7a9a88f 1883 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 1884 bool zero_page = false;
6ef3771c 1885 int ret;
56e93d26 1886
5e5fdcff
XG
1887 if (save_zero_page_to_file(rs, f, block, offset)) {
1888 zero_page = true;
1889 goto exit;
1890 }
1891
6ef3771c 1892 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1893
1894 /*
1895 * copy it to a internal buffer to avoid it being modified by VM
1896 * so that we can catch up the error during compression and
1897 * decompression
1898 */
1899 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1900 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1901 if (ret < 0) {
1902 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1903 error_report("compressed data failed!");
5e5fdcff 1904 return false;
b3be2896 1905 }
56e93d26 1906
5e5fdcff 1907exit:
6ef3771c 1908 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
1909 return zero_page;
1910}
1911
1912static void
1913update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1914{
76e03000
XG
1915 ram_counters.transferred += bytes_xmit;
1916
5e5fdcff
XG
1917 if (param->zero_page) {
1918 ram_counters.duplicate++;
76e03000 1919 return;
5e5fdcff 1920 }
76e03000
XG
1921
1922 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1923 compression_counters.compressed_size += bytes_xmit - 8;
1924 compression_counters.pages++;
56e93d26
JQ
1925}
1926
32b05495
XG
1927static bool save_page_use_compression(RAMState *rs);
1928
ce25d337 1929static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1930{
1931 int idx, len, thread_count;
1932
32b05495 1933 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1934 return;
1935 }
1936 thread_count = migrate_compress_threads();
a7a9a88f 1937
0d9f9a5c 1938 qemu_mutex_lock(&comp_done_lock);
56e93d26 1939 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1940 while (!comp_param[idx].done) {
0d9f9a5c 1941 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1942 }
a7a9a88f 1943 }
0d9f9a5c 1944 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1945
1946 for (idx = 0; idx < thread_count; idx++) {
1947 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1948 if (!comp_param[idx].quit) {
ce25d337 1949 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1950 /*
1951 * it's safe to fetch zero_page without holding comp_done_lock
1952 * as there is no further request submitted to the thread,
1953 * i.e, the thread should be waiting for a request at this point.
1954 */
1955 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1956 }
a7a9a88f 1957 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1958 }
1959}
1960
1961static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1962 ram_addr_t offset)
1963{
1964 param->block = block;
1965 param->offset = offset;
1966}
1967
ce25d337
JQ
1968static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1969 ram_addr_t offset)
56e93d26
JQ
1970{
1971 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1972 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1973
1974 thread_count = migrate_compress_threads();
0d9f9a5c 1975 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1976retry:
1977 for (idx = 0; idx < thread_count; idx++) {
1978 if (comp_param[idx].done) {
1979 comp_param[idx].done = false;
1980 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1981 qemu_mutex_lock(&comp_param[idx].mutex);
1982 set_compress_params(&comp_param[idx], block, offset);
1983 qemu_cond_signal(&comp_param[idx].cond);
1984 qemu_mutex_unlock(&comp_param[idx].mutex);
1985 pages = 1;
5e5fdcff 1986 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1987 break;
56e93d26
JQ
1988 }
1989 }
1d58872a
XG
1990
1991 /*
1992 * wait for the free thread if the user specifies 'compress-wait-thread',
1993 * otherwise we will post the page out in the main thread as normal page.
1994 */
1995 if (pages < 0 && wait) {
1996 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1997 goto retry;
1998 }
0d9f9a5c 1999 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2000
2001 return pages;
2002}
2003
3d0684b2
JQ
2004/**
2005 * find_dirty_block: find the next dirty page and update any state
2006 * associated with the search process.
b9e60928 2007 *
3d0684b2 2008 * Returns if a page is found
b9e60928 2009 *
6f37bb8b 2010 * @rs: current RAM state
3d0684b2
JQ
2011 * @pss: data about the state of the current dirty page scan
2012 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2013 */
f20e2865 2014static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2015{
f20e2865 2016 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2017 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2018 pss->page >= rs->last_page) {
b9e60928
DDAG
2019 /*
2020 * We've been once around the RAM and haven't found anything.
2021 * Give up.
2022 */
2023 *again = false;
2024 return false;
2025 }
a935e30f 2026 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2027 /* Didn't find anything in this RAM Block */
a935e30f 2028 pss->page = 0;
b9e60928
DDAG
2029 pss->block = QLIST_NEXT_RCU(pss->block, next);
2030 if (!pss->block) {
48df9d80
XG
2031 /*
2032 * If memory migration starts over, we will meet a dirtied page
2033 * which may still exists in compression threads's ring, so we
2034 * should flush the compressed data to make sure the new page
2035 * is not overwritten by the old one in the destination.
2036 *
2037 * Also If xbzrle is on, stop using the data compression at this
2038 * point. In theory, xbzrle can do better than compression.
2039 */
2040 flush_compressed_data(rs);
2041
b9e60928
DDAG
2042 /* Hit the end of the list */
2043 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2044 /* Flag that we've looped */
2045 pss->complete_round = true;
6f37bb8b 2046 rs->ram_bulk_stage = false;
b9e60928
DDAG
2047 }
2048 /* Didn't find anything this time, but try again on the new block */
2049 *again = true;
2050 return false;
2051 } else {
2052 /* Can go around again, but... */
2053 *again = true;
2054 /* We've found something so probably don't need to */
2055 return true;
2056 }
2057}
2058
3d0684b2
JQ
2059/**
2060 * unqueue_page: gets a page of the queue
2061 *
a82d593b 2062 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2063 *
3d0684b2
JQ
2064 * Returns the block of the page (or NULL if none available)
2065 *
ec481c6c 2066 * @rs: current RAM state
3d0684b2 2067 * @offset: used to return the offset within the RAMBlock
a82d593b 2068 */
f20e2865 2069static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2070{
2071 RAMBlock *block = NULL;
2072
ae526e32
XG
2073 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2074 return NULL;
2075 }
2076
ec481c6c
JQ
2077 qemu_mutex_lock(&rs->src_page_req_mutex);
2078 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2079 struct RAMSrcPageRequest *entry =
2080 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2081 block = entry->rb;
2082 *offset = entry->offset;
a82d593b
DDAG
2083
2084 if (entry->len > TARGET_PAGE_SIZE) {
2085 entry->len -= TARGET_PAGE_SIZE;
2086 entry->offset += TARGET_PAGE_SIZE;
2087 } else {
2088 memory_region_unref(block->mr);
ec481c6c 2089 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2090 g_free(entry);
e03a34f8 2091 migration_consume_urgent_request();
a82d593b
DDAG
2092 }
2093 }
ec481c6c 2094 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2095
2096 return block;
2097}
2098
3d0684b2
JQ
2099/**
2100 * get_queued_page: unqueue a page from the postocpy requests
2101 *
2102 * Skips pages that are already sent (!dirty)
a82d593b 2103 *
3d0684b2 2104 * Returns if a queued page is found
a82d593b 2105 *
6f37bb8b 2106 * @rs: current RAM state
3d0684b2 2107 * @pss: data about the state of the current dirty page scan
a82d593b 2108 */
f20e2865 2109static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2110{
2111 RAMBlock *block;
2112 ram_addr_t offset;
2113 bool dirty;
2114
2115 do {
f20e2865 2116 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2117 /*
2118 * We're sending this page, and since it's postcopy nothing else
2119 * will dirty it, and we must make sure it doesn't get sent again
2120 * even if this queue request was received after the background
2121 * search already sent it.
2122 */
2123 if (block) {
f20e2865
JQ
2124 unsigned long page;
2125
6b6712ef
JQ
2126 page = offset >> TARGET_PAGE_BITS;
2127 dirty = test_bit(page, block->bmap);
a82d593b 2128 if (!dirty) {
06b10688 2129 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 2130 page, test_bit(page, block->unsentmap));
a82d593b 2131 } else {
f20e2865 2132 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2133 }
2134 }
2135
2136 } while (block && !dirty);
2137
2138 if (block) {
2139 /*
2140 * As soon as we start servicing pages out of order, then we have
2141 * to kill the bulk stage, since the bulk stage assumes
2142 * in (migration_bitmap_find_and_reset_dirty) that every page is
2143 * dirty, that's no longer true.
2144 */
6f37bb8b 2145 rs->ram_bulk_stage = false;
a82d593b
DDAG
2146
2147 /*
2148 * We want the background search to continue from the queued page
2149 * since the guest is likely to want other pages near to the page
2150 * it just requested.
2151 */
2152 pss->block = block;
a935e30f 2153 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
2154 }
2155
2156 return !!block;
2157}
2158
6c595cde 2159/**
5e58f968
JQ
2160 * migration_page_queue_free: drop any remaining pages in the ram
2161 * request queue
6c595cde 2162 *
3d0684b2
JQ
2163 * It should be empty at the end anyway, but in error cases there may
2164 * be some left. in case that there is any page left, we drop it.
2165 *
6c595cde 2166 */
83c13382 2167static void migration_page_queue_free(RAMState *rs)
6c595cde 2168{
ec481c6c 2169 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2170 /* This queue generally should be empty - but in the case of a failed
2171 * migration might have some droppings in.
2172 */
2173 rcu_read_lock();
ec481c6c 2174 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2175 memory_region_unref(mspr->rb->mr);
ec481c6c 2176 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2177 g_free(mspr);
2178 }
2179 rcu_read_unlock();
2180}
2181
2182/**
3d0684b2
JQ
2183 * ram_save_queue_pages: queue the page for transmission
2184 *
2185 * A request from postcopy destination for example.
2186 *
2187 * Returns zero on success or negative on error
2188 *
3d0684b2
JQ
2189 * @rbname: Name of the RAMBLock of the request. NULL means the
2190 * same that last one.
2191 * @start: starting address from the start of the RAMBlock
2192 * @len: length (in bytes) to send
6c595cde 2193 */
96506894 2194int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2195{
2196 RAMBlock *ramblock;
53518d94 2197 RAMState *rs = ram_state;
6c595cde 2198
9360447d 2199 ram_counters.postcopy_requests++;
6c595cde
DDAG
2200 rcu_read_lock();
2201 if (!rbname) {
2202 /* Reuse last RAMBlock */
68a098f3 2203 ramblock = rs->last_req_rb;
6c595cde
DDAG
2204
2205 if (!ramblock) {
2206 /*
2207 * Shouldn't happen, we can't reuse the last RAMBlock if
2208 * it's the 1st request.
2209 */
2210 error_report("ram_save_queue_pages no previous block");
2211 goto err;
2212 }
2213 } else {
2214 ramblock = qemu_ram_block_by_name(rbname);
2215
2216 if (!ramblock) {
2217 /* We shouldn't be asked for a non-existent RAMBlock */
2218 error_report("ram_save_queue_pages no block '%s'", rbname);
2219 goto err;
2220 }
68a098f3 2221 rs->last_req_rb = ramblock;
6c595cde
DDAG
2222 }
2223 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2224 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2225 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2226 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
2227 __func__, start, len, ramblock->used_length);
2228 goto err;
2229 }
2230
ec481c6c
JQ
2231 struct RAMSrcPageRequest *new_entry =
2232 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2233 new_entry->rb = ramblock;
2234 new_entry->offset = start;
2235 new_entry->len = len;
2236
2237 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2238 qemu_mutex_lock(&rs->src_page_req_mutex);
2239 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2240 migration_make_urgent_request();
ec481c6c 2241 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2242 rcu_read_unlock();
2243
2244 return 0;
2245
2246err:
2247 rcu_read_unlock();
2248 return -1;
2249}
2250
d7400a34
XG
2251static bool save_page_use_compression(RAMState *rs)
2252{
2253 if (!migrate_use_compression()) {
2254 return false;
2255 }
2256
2257 /*
2258 * If xbzrle is on, stop using the data compression after first
2259 * round of migration even if compression is enabled. In theory,
2260 * xbzrle can do better than compression.
2261 */
2262 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2263 return true;
2264 }
2265
2266 return false;
2267}
2268
5e5fdcff
XG
2269/*
2270 * try to compress the page before posting it out, return true if the page
2271 * has been properly handled by compression, otherwise needs other
2272 * paths to handle it
2273 */
2274static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2275{
2276 if (!save_page_use_compression(rs)) {
2277 return false;
2278 }
2279
2280 /*
2281 * When starting the process of a new block, the first page of
2282 * the block should be sent out before other pages in the same
2283 * block, and all the pages in last block should have been sent
2284 * out, keeping this order is important, because the 'cont' flag
2285 * is used to avoid resending the block name.
2286 *
2287 * We post the fist page as normal page as compression will take
2288 * much CPU resource.
2289 */
2290 if (block != rs->last_sent_block) {
2291 flush_compressed_data(rs);
2292 return false;
2293 }
2294
2295 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2296 return true;
2297 }
2298
76e03000 2299 compression_counters.busy++;
5e5fdcff
XG
2300 return false;
2301}
2302
a82d593b 2303/**
3d0684b2 2304 * ram_save_target_page: save one target page
a82d593b 2305 *
3d0684b2 2306 * Returns the number of pages written
a82d593b 2307 *
6f37bb8b 2308 * @rs: current RAM state
3d0684b2 2309 * @pss: data about the page we want to send
a82d593b 2310 * @last_stage: if we are at the completion stage
a82d593b 2311 */
a0a8aa14 2312static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2313 bool last_stage)
a82d593b 2314{
a8ec91f9
XG
2315 RAMBlock *block = pss->block;
2316 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2317 int res;
2318
2319 if (control_save_page(rs, block, offset, &res)) {
2320 return res;
2321 }
2322
5e5fdcff
XG
2323 if (save_compress_page(rs, block, offset)) {
2324 return 1;
d7400a34
XG
2325 }
2326
2327 res = save_zero_page(rs, block, offset);
2328 if (res > 0) {
2329 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2330 * page would be stale
2331 */
2332 if (!save_page_use_compression(rs)) {
2333 XBZRLE_cache_lock();
2334 xbzrle_cache_zero_page(rs, block->offset + offset);
2335 XBZRLE_cache_unlock();
2336 }
2337 ram_release_pages(block->idstr, offset, res);
2338 return res;
2339 }
2340
da3f56cb 2341 /*
5e5fdcff
XG
2342 * do not use multifd for compression as the first page in the new
2343 * block should be posted out before sending the compressed page
da3f56cb 2344 */
5e5fdcff 2345 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2346 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2347 }
2348
1faa5665 2349 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2350}
2351
2352/**
3d0684b2 2353 * ram_save_host_page: save a whole host page
a82d593b 2354 *
3d0684b2
JQ
2355 * Starting at *offset send pages up to the end of the current host
2356 * page. It's valid for the initial offset to point into the middle of
2357 * a host page in which case the remainder of the hostpage is sent.
2358 * Only dirty target pages are sent. Note that the host page size may
2359 * be a huge page for this block.
1eb3fc0a
DDAG
2360 * The saving stops at the boundary of the used_length of the block
2361 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2362 *
3d0684b2
JQ
2363 * Returns the number of pages written or negative on error
2364 *
6f37bb8b 2365 * @rs: current RAM state
3d0684b2 2366 * @ms: current migration state
3d0684b2 2367 * @pss: data about the page we want to send
a82d593b 2368 * @last_stage: if we are at the completion stage
a82d593b 2369 */
a0a8aa14 2370static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2371 bool last_stage)
a82d593b
DDAG
2372{
2373 int tmppages, pages = 0;
a935e30f
JQ
2374 size_t pagesize_bits =
2375 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2376
b895de50
CLG
2377 if (!qemu_ram_is_migratable(pss->block)) {
2378 error_report("block %s should not be migrated !", pss->block->idstr);
2379 return 0;
2380 }
2381
a82d593b 2382 do {
1faa5665
XG
2383 /* Check the pages is dirty and if it is send it */
2384 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2385 pss->page++;
2386 continue;
2387 }
2388
f20e2865 2389 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2390 if (tmppages < 0) {
2391 return tmppages;
2392 }
2393
2394 pages += tmppages;
1faa5665
XG
2395 if (pss->block->unsentmap) {
2396 clear_bit(pss->page, pss->block->unsentmap);
2397 }
2398
a935e30f 2399 pss->page++;
1eb3fc0a
DDAG
2400 } while ((pss->page & (pagesize_bits - 1)) &&
2401 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2402
2403 /* The offset we leave with is the last one we looked at */
a935e30f 2404 pss->page--;
a82d593b
DDAG
2405 return pages;
2406}
6c595cde 2407
56e93d26 2408/**
3d0684b2 2409 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2410 *
2411 * Called within an RCU critical section.
2412 *
e8f3735f
XG
2413 * Returns the number of pages written where zero means no dirty pages,
2414 * or negative on error
56e93d26 2415 *
6f37bb8b 2416 * @rs: current RAM state
56e93d26 2417 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2418 *
2419 * On systems where host-page-size > target-page-size it will send all the
2420 * pages in a host page that are dirty.
56e93d26
JQ
2421 */
2422
ce25d337 2423static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2424{
b8fb8cb7 2425 PageSearchStatus pss;
56e93d26 2426 int pages = 0;
b9e60928 2427 bool again, found;
56e93d26 2428
0827b9e9
AA
2429 /* No dirty page as there is zero RAM */
2430 if (!ram_bytes_total()) {
2431 return pages;
2432 }
2433
6f37bb8b 2434 pss.block = rs->last_seen_block;
a935e30f 2435 pss.page = rs->last_page;
b8fb8cb7
DDAG
2436 pss.complete_round = false;
2437
2438 if (!pss.block) {
2439 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2440 }
56e93d26 2441
b9e60928 2442 do {
a82d593b 2443 again = true;
f20e2865 2444 found = get_queued_page(rs, &pss);
b9e60928 2445
a82d593b
DDAG
2446 if (!found) {
2447 /* priority queue empty, so just search for something dirty */
f20e2865 2448 found = find_dirty_block(rs, &pss, &again);
a82d593b 2449 }
f3f491fc 2450
a82d593b 2451 if (found) {
f20e2865 2452 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2453 }
b9e60928 2454 } while (!pages && again);
56e93d26 2455
6f37bb8b 2456 rs->last_seen_block = pss.block;
a935e30f 2457 rs->last_page = pss.page;
56e93d26
JQ
2458
2459 return pages;
2460}
2461
2462void acct_update_position(QEMUFile *f, size_t size, bool zero)
2463{
2464 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2465
56e93d26 2466 if (zero) {
9360447d 2467 ram_counters.duplicate += pages;
56e93d26 2468 } else {
9360447d
JQ
2469 ram_counters.normal += pages;
2470 ram_counters.transferred += size;
56e93d26
JQ
2471 qemu_update_position(f, size);
2472 }
2473}
2474
56e93d26
JQ
2475uint64_t ram_bytes_total(void)
2476{
2477 RAMBlock *block;
2478 uint64_t total = 0;
2479
2480 rcu_read_lock();
b895de50 2481 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26 2482 total += block->used_length;
99e15582 2483 }
56e93d26
JQ
2484 rcu_read_unlock();
2485 return total;
2486}
2487
f265e0e4 2488static void xbzrle_load_setup(void)
56e93d26 2489{
f265e0e4 2490 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2491}
2492
f265e0e4
JQ
2493static void xbzrle_load_cleanup(void)
2494{
2495 g_free(XBZRLE.decoded_buf);
2496 XBZRLE.decoded_buf = NULL;
2497}
2498
7d7c96be
PX
2499static void ram_state_cleanup(RAMState **rsp)
2500{
b9ccaf6d
DDAG
2501 if (*rsp) {
2502 migration_page_queue_free(*rsp);
2503 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2504 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2505 g_free(*rsp);
2506 *rsp = NULL;
2507 }
7d7c96be
PX
2508}
2509
84593a08
PX
2510static void xbzrle_cleanup(void)
2511{
2512 XBZRLE_cache_lock();
2513 if (XBZRLE.cache) {
2514 cache_fini(XBZRLE.cache);
2515 g_free(XBZRLE.encoded_buf);
2516 g_free(XBZRLE.current_buf);
2517 g_free(XBZRLE.zero_target_page);
2518 XBZRLE.cache = NULL;
2519 XBZRLE.encoded_buf = NULL;
2520 XBZRLE.current_buf = NULL;
2521 XBZRLE.zero_target_page = NULL;
2522 }
2523 XBZRLE_cache_unlock();
2524}
2525
f265e0e4 2526static void ram_save_cleanup(void *opaque)
56e93d26 2527{
53518d94 2528 RAMState **rsp = opaque;
6b6712ef 2529 RAMBlock *block;
eb859c53 2530
2ff64038
LZ
2531 /* caller have hold iothread lock or is in a bh, so there is
2532 * no writing race against this migration_bitmap
2533 */
6b6712ef
JQ
2534 memory_global_dirty_log_stop();
2535
b895de50 2536 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2537 g_free(block->bmap);
2538 block->bmap = NULL;
2539 g_free(block->unsentmap);
2540 block->unsentmap = NULL;
56e93d26
JQ
2541 }
2542
84593a08 2543 xbzrle_cleanup();
f0afa331 2544 compress_threads_save_cleanup();
7d7c96be 2545 ram_state_cleanup(rsp);
56e93d26
JQ
2546}
2547
6f37bb8b 2548static void ram_state_reset(RAMState *rs)
56e93d26 2549{
6f37bb8b
JQ
2550 rs->last_seen_block = NULL;
2551 rs->last_sent_block = NULL;
269ace29 2552 rs->last_page = 0;
6f37bb8b
JQ
2553 rs->last_version = ram_list.version;
2554 rs->ram_bulk_stage = true;
56e93d26
JQ
2555}
2556
2557#define MAX_WAIT 50 /* ms, half buffered_file limit */
2558
4f2e4252
DDAG
/*
 * Dump the first 'pages' bits of the bitmap 'todump' to stderr, 128 bits
 * per line ('1' for a set bit, '.' for a clear one).
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is NULL there is nothing to dump and the function returns
 * without printing anything.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* No fallback bitmap exists here; avoid dereferencing NULL */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2592
e0b266f0
DDAG
2593/* **** functions for postcopy ***** */
2594
ced1c616
PB
2595void ram_postcopy_migrated_memory_release(MigrationState *ms)
2596{
2597 struct RAMBlock *block;
ced1c616 2598
b895de50 2599 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2600 unsigned long *bitmap = block->bmap;
2601 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2602 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2603
2604 while (run_start < range) {
2605 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2606 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2607 (run_end - run_start) << TARGET_PAGE_BITS);
2608 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2609 }
2610 }
2611}
2612
3d0684b2
JQ
2613/**
2614 * postcopy_send_discard_bm_ram: discard a RAMBlock
2615 *
2616 * Returns zero on success
2617 *
e0b266f0
DDAG
2618 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2619 * Note: At this point the 'unsentmap' is the processed bitmap combined
2620 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
2621 *
2622 * @ms: current migration state
2623 * @pds: state for postcopy
2624 * @start: RAMBlock starting page
2625 * @length: RAMBlock size
e0b266f0
DDAG
2626 */
2627static int postcopy_send_discard_bm_ram(MigrationState *ms,
2628 PostcopyDiscardState *pds,
6b6712ef 2629 RAMBlock *block)
e0b266f0 2630{
6b6712ef 2631 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2632 unsigned long current;
6b6712ef 2633 unsigned long *unsentmap = block->unsentmap;
e0b266f0 2634
6b6712ef 2635 for (current = 0; current < end; ) {
e0b266f0
DDAG
2636 unsigned long one = find_next_bit(unsentmap, end, current);
2637
2638 if (one <= end) {
2639 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2640 unsigned long discard_length;
2641
2642 if (zero >= end) {
2643 discard_length = end - one;
2644 } else {
2645 discard_length = zero - one;
2646 }
d688c62d
DDAG
2647 if (discard_length) {
2648 postcopy_discard_send_range(ms, pds, one, discard_length);
2649 }
e0b266f0
DDAG
2650 current = one + discard_length;
2651 } else {
2652 current = one;
2653 }
2654 }
2655
2656 return 0;
2657}
2658
3d0684b2
JQ
2659/**
2660 * postcopy_each_ram_send_discard: discard all RAMBlocks
2661 *
2662 * Returns 0 for success or negative for error
2663 *
e0b266f0
DDAG
2664 * Utility for the outgoing postcopy code.
2665 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2666 * passing it bitmap indexes and name.
e0b266f0
DDAG
2667 * (qemu_ram_foreach_block ends up passing unscaled lengths
2668 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2669 *
2670 * @ms: current migration state
e0b266f0
DDAG
2671 */
2672static int postcopy_each_ram_send_discard(MigrationState *ms)
2673{
2674 struct RAMBlock *block;
2675 int ret;
2676
b895de50 2677 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2678 PostcopyDiscardState *pds =
2679 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2680
2681 /*
2682 * Postcopy sends chunks of bitmap over the wire, but it
2683 * just needs indexes at this point, avoids it having
2684 * target page specific code.
2685 */
6b6712ef 2686 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
2687 postcopy_discard_send_finish(ms, pds);
2688 if (ret) {
2689 return ret;
2690 }
2691 }
2692
2693 return 0;
2694}
2695
3d0684b2
JQ
2696/**
2697 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2698 *
2699 * Helper for postcopy_chunk_hostpages; it's called twice to
2700 * canonicalize the two bitmaps, that are similar, but one is
2701 * inverted.
99e314eb 2702 *
3d0684b2
JQ
2703 * Postcopy requires that all target pages in a hostpage are dirty or
2704 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2705 *
3d0684b2
JQ
2706 * @ms: current migration state
2707 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2708 * otherwise we need to canonicalize partially dirty host pages
2709 * @block: block that contains the page we want to canonicalize
2710 * @pds: state for postcopy
99e314eb
DDAG
2711 */
2712static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2713 RAMBlock *block,
2714 PostcopyDiscardState *pds)
2715{
53518d94 2716 RAMState *rs = ram_state;
6b6712ef
JQ
2717 unsigned long *bitmap = block->bmap;
2718 unsigned long *unsentmap = block->unsentmap;
29c59172 2719 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2720 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2721 unsigned long run_start;
2722
29c59172
DDAG
2723 if (block->page_size == TARGET_PAGE_SIZE) {
2724 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2725 return;
2726 }
2727
99e314eb
DDAG
2728 if (unsent_pass) {
2729 /* Find a sent page */
6b6712ef 2730 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
2731 } else {
2732 /* Find a dirty page */
6b6712ef 2733 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
2734 }
2735
6b6712ef 2736 while (run_start < pages) {
99e314eb
DDAG
2737 bool do_fixup = false;
2738 unsigned long fixup_start_addr;
2739 unsigned long host_offset;
2740
2741 /*
2742 * If the start of this run of pages is in the middle of a host
2743 * page, then we need to fixup this host page.
2744 */
2745 host_offset = run_start % host_ratio;
2746 if (host_offset) {
2747 do_fixup = true;
2748 run_start -= host_offset;
2749 fixup_start_addr = run_start;
2750 /* For the next pass */
2751 run_start = run_start + host_ratio;
2752 } else {
2753 /* Find the end of this run */
2754 unsigned long run_end;
2755 if (unsent_pass) {
6b6712ef 2756 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 2757 } else {
6b6712ef 2758 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2759 }
2760 /*
2761 * If the end isn't at the start of a host page, then the
2762 * run doesn't finish at the end of a host page
2763 * and we need to discard.
2764 */
2765 host_offset = run_end % host_ratio;
2766 if (host_offset) {
2767 do_fixup = true;
2768 fixup_start_addr = run_end - host_offset;
2769 /*
2770 * This host page has gone, the next loop iteration starts
2771 * from after the fixup
2772 */
2773 run_start = fixup_start_addr + host_ratio;
2774 } else {
2775 /*
2776 * No discards on this iteration, next loop starts from
2777 * next sent/dirty page
2778 */
2779 run_start = run_end + 1;
2780 }
2781 }
2782
2783 if (do_fixup) {
2784 unsigned long page;
2785
2786 /* Tell the destination to discard this page */
2787 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2788 /* For the unsent_pass we:
2789 * discard partially sent pages
2790 * For the !unsent_pass (dirty) we:
2791 * discard partially dirty pages that were sent
2792 * (any partially sent pages were already discarded
2793 * by the previous unsent_pass)
2794 */
2795 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2796 host_ratio);
2797 }
2798
2799 /* Clean up the bitmap */
2800 for (page = fixup_start_addr;
2801 page < fixup_start_addr + host_ratio; page++) {
2802 /* All pages in this host page are now not sent */
2803 set_bit(page, unsentmap);
2804
2805 /*
2806 * Remark them as dirty, updating the count for any pages
2807 * that weren't previously dirty.
2808 */
0d8ec885 2809 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2810 }
2811 }
2812
2813 if (unsent_pass) {
2814 /* Find the next sent page for the next iteration */
6b6712ef 2815 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
2816 } else {
2817 /* Find the next dirty page for the next iteration */
6b6712ef 2818 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2819 }
2820 }
2821}
2822
3d0684b2
JQ
2823/**
2824 * postcopy_chuck_hostpages: discrad any partially sent host page
2825 *
99e314eb
DDAG
2826 * Utility for the outgoing postcopy code.
2827 *
2828 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2829 * dirty host-page size chunks as all dirty. In this case the host-page
2830 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2831 *
3d0684b2
JQ
2832 * Returns zero on success
2833 *
2834 * @ms: current migration state
6b6712ef 2835 * @block: block we want to work with
99e314eb 2836 */
6b6712ef 2837static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2838{
6b6712ef
JQ
2839 PostcopyDiscardState *pds =
2840 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2841
6b6712ef
JQ
2842 /* First pass: Discard all partially sent host pages */
2843 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2844 /*
2845 * Second pass: Ensure that all partially dirty host pages are made
2846 * fully dirty.
2847 */
2848 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 2849
6b6712ef 2850 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
2851 return 0;
2852}
2853
3d0684b2
JQ
2854/**
2855 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2856 *
2857 * Returns zero on success
2858 *
e0b266f0
DDAG
2859 * Transmit the set of pages to be discarded after precopy to the target
2860 * these are pages that:
2861 * a) Have been previously transmitted but are now dirty again
2862 * b) Pages that have never been transmitted, this ensures that
2863 * any pages on the destination that have been mapped by background
2864 * tasks get discarded (transparent huge pages is the specific concern)
2865 * Hopefully this is pretty sparse
3d0684b2
JQ
2866 *
2867 * @ms: current migration state
e0b266f0
DDAG
2868 */
2869int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2870{
53518d94 2871 RAMState *rs = ram_state;
6b6712ef 2872 RAMBlock *block;
e0b266f0 2873 int ret;
e0b266f0
DDAG
2874
2875 rcu_read_lock();
2876
2877 /* This should be our last sync, the src is now paused */
eb859c53 2878 migration_bitmap_sync(rs);
e0b266f0 2879
6b6712ef
JQ
2880 /* Easiest way to make sure we don't resume in the middle of a host-page */
2881 rs->last_seen_block = NULL;
2882 rs->last_sent_block = NULL;
2883 rs->last_page = 0;
e0b266f0 2884
b895de50 2885 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2886 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2887 unsigned long *bitmap = block->bmap;
2888 unsigned long *unsentmap = block->unsentmap;
2889
2890 if (!unsentmap) {
2891 /* We don't have a safe way to resize the sentmap, so
2892 * if the bitmap was resized it will be NULL at this
2893 * point.
2894 */
2895 error_report("migration ram resized during precopy phase");
2896 rcu_read_unlock();
2897 return -EINVAL;
2898 }
2899 /* Deal with TPS != HPS and huge pages */
2900 ret = postcopy_chunk_hostpages(ms, block);
2901 if (ret) {
2902 rcu_read_unlock();
2903 return ret;
2904 }
e0b266f0 2905
6b6712ef
JQ
2906 /*
2907 * Update the unsentmap to be unsentmap = unsentmap | dirty
2908 */
2909 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 2910#ifdef DEBUG_POSTCOPY
6b6712ef 2911 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 2912#endif
6b6712ef
JQ
2913 }
2914 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2915
2916 ret = postcopy_each_ram_send_discard(ms);
2917 rcu_read_unlock();
2918
2919 return ret;
2920}
2921
3d0684b2
JQ
2922/**
2923 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2924 *
3d0684b2 2925 * Returns zero on success
e0b266f0 2926 *
36449157
JQ
2927 * @rbname: name of the RAMBlock of the request. NULL means the
2928 * same that last one.
3d0684b2
JQ
2929 * @start: RAMBlock starting page
2930 * @length: RAMBlock size
e0b266f0 2931 */
aaa2064c 2932int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
2933{
2934 int ret = -1;
2935
36449157 2936 trace_ram_discard_range(rbname, start, length);
d3a5038c 2937
e0b266f0 2938 rcu_read_lock();
36449157 2939 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2940
2941 if (!rb) {
36449157 2942 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
2943 goto err;
2944 }
2945
814bb08f
PX
2946 /*
2947 * On source VM, we don't need to update the received bitmap since
2948 * we don't even have one.
2949 */
2950 if (rb->receivedmap) {
2951 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2952 length >> qemu_target_page_bits());
2953 }
2954
d3a5038c 2955 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2956
2957err:
2958 rcu_read_unlock();
2959
2960 return ret;
2961}
2962
84593a08
PX
2963/*
2964 * For every allocation, we will try not to crash the VM if the
2965 * allocation failed.
2966 */
2967static int xbzrle_init(void)
2968{
2969 Error *local_err = NULL;
2970
2971 if (!migrate_use_xbzrle()) {
2972 return 0;
2973 }
2974
2975 XBZRLE_cache_lock();
2976
2977 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2978 if (!XBZRLE.zero_target_page) {
2979 error_report("%s: Error allocating zero page", __func__);
2980 goto err_out;
2981 }
2982
2983 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2984 TARGET_PAGE_SIZE, &local_err);
2985 if (!XBZRLE.cache) {
2986 error_report_err(local_err);
2987 goto free_zero_page;
2988 }
2989
2990 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2991 if (!XBZRLE.encoded_buf) {
2992 error_report("%s: Error allocating encoded_buf", __func__);
2993 goto free_cache;
2994 }
2995
2996 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2997 if (!XBZRLE.current_buf) {
2998 error_report("%s: Error allocating current_buf", __func__);
2999 goto free_encoded_buf;
3000 }
3001
3002 /* We are all good */
3003 XBZRLE_cache_unlock();
3004 return 0;
3005
3006free_encoded_buf:
3007 g_free(XBZRLE.encoded_buf);
3008 XBZRLE.encoded_buf = NULL;
3009free_cache:
3010 cache_fini(XBZRLE.cache);
3011 XBZRLE.cache = NULL;
3012free_zero_page:
3013 g_free(XBZRLE.zero_target_page);
3014 XBZRLE.zero_target_page = NULL;
3015err_out:
3016 XBZRLE_cache_unlock();
3017 return -ENOMEM;
3018}
3019
53518d94 3020static int ram_state_init(RAMState **rsp)
56e93d26 3021{
7d00ee6a
PX
3022 *rsp = g_try_new0(RAMState, 1);
3023
3024 if (!*rsp) {
3025 error_report("%s: Init ramstate fail", __func__);
3026 return -1;
3027 }
53518d94
JQ
3028
3029 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3030 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3031 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3032
7d00ee6a
PX
3033 /*
3034 * Count the total number of pages used by ram blocks not including any
3035 * gaps due to alignment or unplugs.
3036 */
3037 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3038
3039 ram_state_reset(*rsp);
3040
3041 return 0;
3042}
3043
d6eff5d7 3044static void ram_list_init_bitmaps(void)
7d00ee6a 3045{
d6eff5d7
PX
3046 RAMBlock *block;
3047 unsigned long pages;
56e93d26 3048
0827b9e9
AA
3049 /* Skip setting bitmap if there is no RAM */
3050 if (ram_bytes_total()) {
b895de50 3051 RAMBLOCK_FOREACH_MIGRATABLE(block) {
d6eff5d7 3052 pages = block->max_length >> TARGET_PAGE_BITS;
6b6712ef
JQ
3053 block->bmap = bitmap_new(pages);
3054 bitmap_set(block->bmap, 0, pages);
3055 if (migrate_postcopy_ram()) {
3056 block->unsentmap = bitmap_new(pages);
3057 bitmap_set(block->unsentmap, 0, pages);
3058 }
0827b9e9 3059 }
f3f491fc 3060 }
d6eff5d7
PX
3061}
3062
3063static void ram_init_bitmaps(RAMState *rs)
3064{
3065 /* For memory_global_dirty_log_start below. */
3066 qemu_mutex_lock_iothread();
3067 qemu_mutex_lock_ramlist();
3068 rcu_read_lock();
f3f491fc 3069
d6eff5d7 3070 ram_list_init_bitmaps();
56e93d26 3071 memory_global_dirty_log_start();
d6eff5d7
PX
3072 migration_bitmap_sync(rs);
3073
3074 rcu_read_unlock();
56e93d26 3075 qemu_mutex_unlock_ramlist();
49877834 3076 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3077}
3078
3079static int ram_init_all(RAMState **rsp)
3080{
3081 if (ram_state_init(rsp)) {
3082 return -1;
3083 }
3084
3085 if (xbzrle_init()) {
3086 ram_state_cleanup(rsp);
3087 return -1;
3088 }
3089
3090 ram_init_bitmaps(*rsp);
a91246c9
HZ
3091
3092 return 0;
3093}
3094
08614f34
PX
3095static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3096{
3097 RAMBlock *block;
3098 uint64_t pages = 0;
3099
3100 /*
3101 * Postcopy is not using xbzrle/compression, so no need for that.
3102 * Also, since source are already halted, we don't need to care
3103 * about dirty page logging as well.
3104 */
3105
ff0769a4 3106 RAMBLOCK_FOREACH_MIGRATABLE(block) {
08614f34
PX
3107 pages += bitmap_count_one(block->bmap,
3108 block->used_length >> TARGET_PAGE_BITS);
3109 }
3110
3111 /* This may not be aligned with current bitmaps. Recalculate. */
3112 rs->migration_dirty_pages = pages;
3113
3114 rs->last_seen_block = NULL;
3115 rs->last_sent_block = NULL;
3116 rs->last_page = 0;
3117 rs->last_version = ram_list.version;
3118 /*
3119 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3120 * matter what we have sent.
3121 */
3122 rs->ram_bulk_stage = false;
3123
3124 /* Update RAMState cache of output QEMUFile */
3125 rs->f = out;
3126
3127 trace_ram_state_resume_prepare(pages);
3128}
3129
3d0684b2
JQ
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3136
3d0684b2
JQ
3137/**
3138 * ram_save_setup: Setup RAM for migration
3139 *
3140 * Returns zero to indicate success and negative for error
3141 *
3142 * @f: QEMUFile where to send the data
3143 * @opaque: RAMState pointer
3144 */
a91246c9
HZ
3145static int ram_save_setup(QEMUFile *f, void *opaque)
3146{
53518d94 3147 RAMState **rsp = opaque;
a91246c9
HZ
3148 RAMBlock *block;
3149
dcaf446e
XG
3150 if (compress_threads_save_setup()) {
3151 return -1;
3152 }
3153
a91246c9
HZ
3154 /* migration has already setup the bitmap, reuse it. */
3155 if (!migration_in_colo_state()) {
7d00ee6a 3156 if (ram_init_all(rsp) != 0) {
dcaf446e 3157 compress_threads_save_cleanup();
a91246c9 3158 return -1;
53518d94 3159 }
a91246c9 3160 }
53518d94 3161 (*rsp)->f = f;
a91246c9
HZ
3162
3163 rcu_read_lock();
56e93d26
JQ
3164
3165 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3166
b895de50 3167 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26
JQ
3168 qemu_put_byte(f, strlen(block->idstr));
3169 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3170 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
3171 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3172 qemu_put_be64(f, block->page_size);
3173 }
56e93d26
JQ
3174 }
3175
3176 rcu_read_unlock();
3177
3178 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3179 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3180
6df264ac 3181 multifd_send_sync_main();
56e93d26 3182 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3183 qemu_fflush(f);
56e93d26
JQ
3184
3185 return 0;
3186}
3187
3d0684b2
JQ
3188/**
3189 * ram_save_iterate: iterative stage for migration
3190 *
3191 * Returns zero to indicate success and negative for error
3192 *
3193 * @f: QEMUFile where to send the data
3194 * @opaque: RAMState pointer
3195 */
56e93d26
JQ
3196static int ram_save_iterate(QEMUFile *f, void *opaque)
3197{
53518d94
JQ
3198 RAMState **temp = opaque;
3199 RAMState *rs = *temp;
56e93d26
JQ
3200 int ret;
3201 int i;
3202 int64_t t0;
5c90308f 3203 int done = 0;
56e93d26 3204
b2557345
PL
3205 if (blk_mig_bulk_active()) {
3206 /* Avoid transferring ram during bulk phase of block migration as
3207 * the bulk phase will usually take a long time and transferring
3208 * ram updates during that time is pointless. */
3209 goto out;
3210 }
3211
56e93d26 3212 rcu_read_lock();
6f37bb8b
JQ
3213 if (ram_list.version != rs->last_version) {
3214 ram_state_reset(rs);
56e93d26
JQ
3215 }
3216
3217 /* Read version before ram_list.blocks */
3218 smp_rmb();
3219
3220 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3221
3222 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3223 i = 0;
e03a34f8
DDAG
3224 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3225 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
56e93d26
JQ
3226 int pages;
3227
e03a34f8
DDAG
3228 if (qemu_file_get_error(f)) {
3229 break;
3230 }
3231
ce25d337 3232 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
3233 /* no more pages to sent */
3234 if (pages == 0) {
5c90308f 3235 done = 1;
56e93d26
JQ
3236 break;
3237 }
e8f3735f
XG
3238
3239 if (pages < 0) {
3240 qemu_file_set_error(f, pages);
3241 break;
3242 }
3243
be8b02ed 3244 rs->target_page_count += pages;
070afca2 3245
56e93d26
JQ
3246 /* we want to check in the 1st loop, just in case it was the 1st time
3247 and we had to sync the dirty bitmap.
3248 qemu_get_clock_ns() is a bit expensive, so we only check each some
3249 iterations
3250 */
3251 if ((i & 63) == 0) {
3252 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3253 if (t1 > MAX_WAIT) {
55c4446b 3254 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
3255 break;
3256 }
3257 }
3258 i++;
3259 }
56e93d26
JQ
3260 rcu_read_unlock();
3261
3262 /*
3263 * Must occur before EOS (or any QEMUFile operation)
3264 * because of RDMA protocol.
3265 */
3266 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3267
6df264ac 3268 multifd_send_sync_main();
b2557345 3269out:
56e93d26 3270 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3271 qemu_fflush(f);
9360447d 3272 ram_counters.transferred += 8;
56e93d26
JQ
3273
3274 ret = qemu_file_get_error(f);
3275 if (ret < 0) {
3276 return ret;
3277 }
3278
5c90308f 3279 return done;
56e93d26
JQ
3280}
3281
3d0684b2
JQ
3282/**
3283 * ram_save_complete: function called to send the remaining amount of ram
3284 *
e8f3735f 3285 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3286 *
3287 * Called with iothread lock
3288 *
3289 * @f: QEMUFile where to send the data
3290 * @opaque: RAMState pointer
3291 */
56e93d26
JQ
3292static int ram_save_complete(QEMUFile *f, void *opaque)
3293{
53518d94
JQ
3294 RAMState **temp = opaque;
3295 RAMState *rs = *temp;
e8f3735f 3296 int ret = 0;
6f37bb8b 3297
56e93d26
JQ
3298 rcu_read_lock();
3299
5727309d 3300 if (!migration_in_postcopy()) {
8d820d6f 3301 migration_bitmap_sync(rs);
663e6c1d 3302 }
56e93d26
JQ
3303
3304 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3305
3306 /* try transferring iterative blocks of memory */
3307
3308 /* flush all remaining blocks regardless of rate limiting */
3309 while (true) {
3310 int pages;
3311
ce25d337 3312 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
3313 /* no more blocks to sent */
3314 if (pages == 0) {
3315 break;
3316 }
e8f3735f
XG
3317 if (pages < 0) {
3318 ret = pages;
3319 break;
3320 }
56e93d26
JQ
3321 }
3322
ce25d337 3323 flush_compressed_data(rs);
56e93d26 3324 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
3325
3326 rcu_read_unlock();
d09a6fde 3327
6df264ac 3328 multifd_send_sync_main();
56e93d26 3329 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3330 qemu_fflush(f);
56e93d26 3331
e8f3735f 3332 return ret;
56e93d26
JQ
3333}
3334
c31b098f 3335static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3336 uint64_t *res_precopy_only,
3337 uint64_t *res_compatible,
3338 uint64_t *res_postcopy_only)
56e93d26 3339{
53518d94
JQ
3340 RAMState **temp = opaque;
3341 RAMState *rs = *temp;
56e93d26
JQ
3342 uint64_t remaining_size;
3343
9edabd4d 3344 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3345
5727309d 3346 if (!migration_in_postcopy() &&
663e6c1d 3347 remaining_size < max_size) {
56e93d26
JQ
3348 qemu_mutex_lock_iothread();
3349 rcu_read_lock();
8d820d6f 3350 migration_bitmap_sync(rs);
56e93d26
JQ
3351 rcu_read_unlock();
3352 qemu_mutex_unlock_iothread();
9edabd4d 3353 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3354 }
c31b098f 3355
86e1167e
VSO
3356 if (migrate_postcopy_ram()) {
3357 /* We can do postcopy, and all the data is postcopiable */
47995026 3358 *res_compatible += remaining_size;
86e1167e 3359 } else {
47995026 3360 *res_precopy_only += remaining_size;
86e1167e 3361 }
56e93d26
JQ
3362}
3363
3364static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3365{
3366 unsigned int xh_len;
3367 int xh_flags;
063e760a 3368 uint8_t *loaded_data;
56e93d26 3369
56e93d26
JQ
3370 /* extract RLE header */
3371 xh_flags = qemu_get_byte(f);
3372 xh_len = qemu_get_be16(f);
3373
3374 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3375 error_report("Failed to load XBZRLE page - wrong compression!");
3376 return -1;
3377 }
3378
3379 if (xh_len > TARGET_PAGE_SIZE) {
3380 error_report("Failed to load XBZRLE page - len overflow!");
3381 return -1;
3382 }
f265e0e4 3383 loaded_data = XBZRLE.decoded_buf;
56e93d26 3384 /* load data and decode */
f265e0e4 3385 /* it can change loaded_data to point to an internal buffer */
063e760a 3386 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3387
3388 /* decode RLE */
063e760a 3389 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3390 TARGET_PAGE_SIZE) == -1) {
3391 error_report("Failed to load XBZRLE page - decode error!");
3392 return -1;
3393 }
3394
3395 return 0;
3396}
3397
3d0684b2
JQ
3398/**
3399 * ram_block_from_stream: read a RAMBlock id from the migration stream
3400 *
3401 * Must be called from within a rcu critical section.
3402 *
56e93d26 3403 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3404 *
3d0684b2
JQ
3405 * @f: QEMUFile where to read the data from
3406 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3407 */
3d0684b2 3408static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3409{
3410 static RAMBlock *block = NULL;
3411 char id[256];
3412 uint8_t len;
3413
3414 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3415 if (!block) {
56e93d26
JQ
3416 error_report("Ack, bad migration stream!");
3417 return NULL;
3418 }
4c4bad48 3419 return block;
56e93d26
JQ
3420 }
3421
3422 len = qemu_get_byte(f);
3423 qemu_get_buffer(f, (uint8_t *)id, len);
3424 id[len] = 0;
3425
e3dd7493 3426 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3427 if (!block) {
3428 error_report("Can't find block %s", id);
3429 return NULL;
56e93d26
JQ
3430 }
3431
b895de50
CLG
3432 if (!qemu_ram_is_migratable(block)) {
3433 error_report("block %s should not be migrated !", id);
3434 return NULL;
3435 }
3436
4c4bad48
HZ
3437 return block;
3438}
3439
3440static inline void *host_from_ram_block_offset(RAMBlock *block,
3441 ram_addr_t offset)
3442{
3443 if (!offset_in_ramblock(block, offset)) {
3444 return NULL;
3445 }
3446
3447 return block->host + offset;
56e93d26
JQ
3448}
3449
13af18f2
ZC
3450static inline void *colo_cache_from_block_offset(RAMBlock *block,
3451 ram_addr_t offset)
3452{
3453 if (!offset_in_ramblock(block, offset)) {
3454 return NULL;
3455 }
3456 if (!block->colo_cache) {
3457 error_report("%s: colo_cache is NULL in block :%s",
3458 __func__, block->idstr);
3459 return NULL;
3460 }
3461 return block->colo_cache + offset;
3462}
3463
3d0684b2
JQ
3464/**
3465 * ram_handle_compressed: handle the zero page case
3466 *
56e93d26
JQ
3467 * If a page (or a whole RDMA chunk) has been
3468 * determined to be zero, then zap it.
3d0684b2
JQ
3469 *
3470 * @host: host address for the zero page
3471 * @ch: what the page is filled from. We only support zero
3472 * @size: size of the zero page
56e93d26
JQ
3473 */
3474void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3475{
3476 if (ch != 0 || !is_zero_range(host, size)) {
3477 memset(host, ch, size);
3478 }
3479}
3480
797ca154
XG
3481/* return the size after decompression, or negative value on error */
3482static int
3483qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3484 const uint8_t *source, size_t source_len)
3485{
3486 int err;
3487
3488 err = inflateReset(stream);
3489 if (err != Z_OK) {
3490 return -1;
3491 }
3492
3493 stream->avail_in = source_len;
3494 stream->next_in = (uint8_t *)source;
3495 stream->avail_out = dest_len;
3496 stream->next_out = dest;
3497
3498 err = inflate(stream, Z_NO_FLUSH);
3499 if (err != Z_STREAM_END) {
3500 return -1;
3501 }
3502
3503 return stream->total_out;
3504}
3505
56e93d26
JQ
3506static void *do_data_decompress(void *opaque)
3507{
3508 DecompressParam *param = opaque;
3509 unsigned long pagesize;
33d151f4 3510 uint8_t *des;
34ab9e97 3511 int len, ret;
56e93d26 3512
33d151f4 3513 qemu_mutex_lock(&param->mutex);
90e56fb4 3514 while (!param->quit) {
33d151f4
LL
3515 if (param->des) {
3516 des = param->des;
3517 len = param->len;
3518 param->des = 0;
3519 qemu_mutex_unlock(&param->mutex);
3520
56e93d26 3521 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3522
3523 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3524 param->compbuf, len);
f548222c 3525 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3526 error_report("decompress data failed");
3527 qemu_file_set_error(decomp_file, ret);
3528 }
73a8912b 3529
33d151f4
LL
3530 qemu_mutex_lock(&decomp_done_lock);
3531 param->done = true;
3532 qemu_cond_signal(&decomp_done_cond);
3533 qemu_mutex_unlock(&decomp_done_lock);
3534
3535 qemu_mutex_lock(&param->mutex);
3536 } else {
3537 qemu_cond_wait(&param->cond, &param->mutex);
3538 }
56e93d26 3539 }
33d151f4 3540 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3541
3542 return NULL;
3543}
3544
34ab9e97 3545static int wait_for_decompress_done(void)
5533b2e9
LL
3546{
3547 int idx, thread_count;
3548
3549 if (!migrate_use_compression()) {
34ab9e97 3550 return 0;
5533b2e9
LL
3551 }
3552
3553 thread_count = migrate_decompress_threads();
3554 qemu_mutex_lock(&decomp_done_lock);
3555 for (idx = 0; idx < thread_count; idx++) {
3556 while (!decomp_param[idx].done) {
3557 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3558 }
3559 }
3560 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3561 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3562}
3563
f0afa331 3564static void compress_threads_load_cleanup(void)
56e93d26
JQ
3565{
3566 int i, thread_count;
3567
3416ab5b
JQ
3568 if (!migrate_use_compression()) {
3569 return;
3570 }
56e93d26
JQ
3571 thread_count = migrate_decompress_threads();
3572 for (i = 0; i < thread_count; i++) {
797ca154
XG
3573 /*
3574 * we use it as a indicator which shows if the thread is
3575 * properly init'd or not
3576 */
3577 if (!decomp_param[i].compbuf) {
3578 break;
3579 }
3580
56e93d26 3581 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3582 decomp_param[i].quit = true;
56e93d26
JQ
3583 qemu_cond_signal(&decomp_param[i].cond);
3584 qemu_mutex_unlock(&decomp_param[i].mutex);
3585 }
3586 for (i = 0; i < thread_count; i++) {
797ca154
XG
3587 if (!decomp_param[i].compbuf) {
3588 break;
3589 }
3590
56e93d26
JQ
3591 qemu_thread_join(decompress_threads + i);
3592 qemu_mutex_destroy(&decomp_param[i].mutex);
3593 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3594 inflateEnd(&decomp_param[i].stream);
56e93d26 3595 g_free(decomp_param[i].compbuf);
797ca154 3596 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3597 }
3598 g_free(decompress_threads);
3599 g_free(decomp_param);
56e93d26
JQ
3600 decompress_threads = NULL;
3601 decomp_param = NULL;
34ab9e97 3602 decomp_file = NULL;
56e93d26
JQ
3603}
3604
34ab9e97 3605static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3606{
3607 int i, thread_count;
3608
3609 if (!migrate_use_compression()) {
3610 return 0;
3611 }
3612
3613 thread_count = migrate_decompress_threads();
3614 decompress_threads = g_new0(QemuThread, thread_count);
3615 decomp_param = g_new0(DecompressParam, thread_count);
3616 qemu_mutex_init(&decomp_done_lock);
3617 qemu_cond_init(&decomp_done_cond);
34ab9e97 3618 decomp_file = f;
797ca154
XG
3619 for (i = 0; i < thread_count; i++) {
3620 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3621 goto exit;
3622 }
3623
3624 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3625 qemu_mutex_init(&decomp_param[i].mutex);
3626 qemu_cond_init(&decomp_param[i].cond);
3627 decomp_param[i].done = true;
3628 decomp_param[i].quit = false;
3629 qemu_thread_create(decompress_threads + i, "decompress",
3630 do_data_decompress, decomp_param + i,
3631 QEMU_THREAD_JOINABLE);
3632 }
3633 return 0;
3634exit:
3635 compress_threads_load_cleanup();
3636 return -1;
3637}
3638
c1bc6626 3639static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3640 void *host, int len)
3641{
3642 int idx, thread_count;
3643
3644 thread_count = migrate_decompress_threads();
73a8912b 3645 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3646 while (true) {
3647 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3648 if (decomp_param[idx].done) {
33d151f4
LL
3649 decomp_param[idx].done = false;
3650 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3651 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3652 decomp_param[idx].des = host;
3653 decomp_param[idx].len = len;
33d151f4
LL
3654 qemu_cond_signal(&decomp_param[idx].cond);
3655 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3656 break;
3657 }
3658 }
3659 if (idx < thread_count) {
3660 break;
73a8912b
LL
3661 } else {
3662 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3663 }
3664 }
73a8912b 3665 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3666}
3667
13af18f2
ZC
3668/*
3669 * colo cache: this is for secondary VM, we cache the whole
3670 * memory of the secondary VM, it is need to hold the global lock
3671 * to call this helper.
3672 */
3673int colo_init_ram_cache(void)
3674{
3675 RAMBlock *block;
3676
3677 rcu_read_lock();
3678 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3679 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3680 NULL,
3681 false);
3682 if (!block->colo_cache) {
3683 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3684 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3685 block->used_length);
3686 goto out_locked;
3687 }
3688 memcpy(block->colo_cache, block->host, block->used_length);
3689 }
3690 rcu_read_unlock();
3691 return 0;
3692
3693out_locked:
3694 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3695 if (block->colo_cache) {
3696 qemu_anon_ram_free(block->colo_cache, block->used_length);
3697 block->colo_cache = NULL;
3698 }
3699 }
3700
3701 rcu_read_unlock();
3702 return -errno;
3703}
3704
3705/* It is need to hold the global lock to call this helper */
3706void colo_release_ram_cache(void)
3707{
3708 RAMBlock *block;
3709
3710 rcu_read_lock();
3711 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3712 if (block->colo_cache) {
3713 qemu_anon_ram_free(block->colo_cache, block->used_length);
3714 block->colo_cache = NULL;
3715 }
3716 }
3717 rcu_read_unlock();
3718}
3719
f265e0e4
JQ
3720/**
3721 * ram_load_setup: Setup RAM for migration incoming side
3722 *
3723 * Returns zero to indicate success and negative for error
3724 *
3725 * @f: QEMUFile where to receive the data
3726 * @opaque: RAMState pointer
3727 */
3728static int ram_load_setup(QEMUFile *f, void *opaque)
3729{
34ab9e97 3730 if (compress_threads_load_setup(f)) {
797ca154
XG
3731 return -1;
3732 }
3733
f265e0e4 3734 xbzrle_load_setup();
f9494614 3735 ramblock_recv_map_init();
13af18f2 3736
f265e0e4
JQ
3737 return 0;
3738}
3739
3740static int ram_load_cleanup(void *opaque)
3741{
f9494614 3742 RAMBlock *rb;
56eb90af
JH
3743
3744 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3745 if (ramblock_is_pmem(rb)) {
3746 pmem_persist(rb->host, rb->used_length);
3747 }
3748 }
3749
f265e0e4 3750 xbzrle_load_cleanup();
f0afa331 3751 compress_threads_load_cleanup();
f9494614 3752
b895de50 3753 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
f9494614
AP
3754 g_free(rb->receivedmap);
3755 rb->receivedmap = NULL;
3756 }
13af18f2 3757
f265e0e4
JQ
3758 return 0;
3759}
3760
3d0684b2
JQ
3761/**
3762 * ram_postcopy_incoming_init: allocate postcopy data structures
3763 *
3764 * Returns 0 for success and negative if there was one error
3765 *
3766 * @mis: current migration incoming state
3767 *
3768 * Allocate data structures etc needed by incoming migration with
3769 * postcopy-ram. postcopy-ram's similarly names
3770 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3771 */
3772int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3773{
c136180c 3774 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3775}
3776
3d0684b2
JQ
3777/**
3778 * ram_load_postcopy: load a page in postcopy case
3779 *
3780 * Returns 0 for success or -errno in case of error
3781 *
a7180877
DDAG
3782 * Called in postcopy mode by ram_load().
3783 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3784 *
3785 * @f: QEMUFile where to send the data
a7180877
DDAG
3786 */
3787static int ram_load_postcopy(QEMUFile *f)
3788{
3789 int flags = 0, ret = 0;
3790 bool place_needed = false;
1aa83678 3791 bool matches_target_page_size = false;
a7180877
DDAG
3792 MigrationIncomingState *mis = migration_incoming_get_current();
3793 /* Temporary page that is later 'placed' */
3794 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 3795 void *last_host = NULL;
a3b6ff6d 3796 bool all_zero = false;
a7180877
DDAG
3797
3798 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3799 ram_addr_t addr;
3800 void *host = NULL;
3801 void *page_buffer = NULL;
3802 void *place_source = NULL;
df9ff5e1 3803 RAMBlock *block = NULL;
a7180877 3804 uint8_t ch;
a7180877
DDAG
3805
3806 addr = qemu_get_be64(f);
7a9ddfbf
PX
3807
3808 /*
3809 * If qemu file error, we should stop here, and then "addr"
3810 * may be invalid
3811 */
3812 ret = qemu_file_get_error(f);
3813 if (ret) {
3814 break;
3815 }
3816
a7180877
DDAG
3817 flags = addr & ~TARGET_PAGE_MASK;
3818 addr &= TARGET_PAGE_MASK;
3819
3820 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3821 place_needed = false;
bb890ed5 3822 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 3823 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3824
3825 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3826 if (!host) {
3827 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3828 ret = -EINVAL;
3829 break;
3830 }
1aa83678 3831 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3832 /*
28abd200
DDAG
3833 * Postcopy requires that we place whole host pages atomically;
3834 * these may be huge pages for RAMBlocks that are backed by
3835 * hugetlbfs.
a7180877
DDAG
3836 * To make it atomic, the data is read into a temporary page
3837 * that's moved into place later.
3838 * The migration protocol uses, possibly smaller, target-pages
3839 * however the source ensures it always sends all the components
3840 * of a host page in order.
3841 */
3842 page_buffer = postcopy_host_page +
28abd200 3843 ((uintptr_t)host & (block->page_size - 1));
a7180877 3844 /* If all TP are zero then we can optimise the place */
28abd200 3845 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 3846 all_zero = true;
c53b7ddc
DDAG
3847 } else {
3848 /* not the 1st TP within the HP */
3849 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 3850 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
3851 host, last_host);
3852 ret = -EINVAL;
3853 break;
3854 }
a7180877
DDAG
3855 }
3856
c53b7ddc 3857
a7180877
DDAG
3858 /*
3859 * If it's the last part of a host page then we place the host
3860 * page
3861 */
3862 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 3863 (block->page_size - 1)) == 0;
a7180877
DDAG
3864 place_source = postcopy_host_page;
3865 }
c53b7ddc 3866 last_host = host;
a7180877
DDAG
3867
3868 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3869 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
3870 ch = qemu_get_byte(f);
3871 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3872 if (ch) {
3873 all_zero = false;
3874 }
3875 break;
3876
3877 case RAM_SAVE_FLAG_PAGE:
3878 all_zero = false;
1aa83678
PX
3879 if (!matches_target_page_size) {
3880 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3881 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3882 } else {
1aa83678
PX
3883 /*
3884 * For small pages that matches target page size, we
3885 * avoid the qemu_file copy. Instead we directly use
3886 * the buffer of QEMUFile to place the page. Note: we
3887 * cannot do any QEMUFile operation before using that
3888 * buffer to make sure the buffer is valid when
3889 * placing the page.
a7180877
DDAG
3890 */
3891 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3892 TARGET_PAGE_SIZE);
3893 }
3894 break;
3895 case RAM_SAVE_FLAG_EOS:
3896 /* normal exit */
6df264ac 3897 multifd_recv_sync_main();
a7180877
DDAG
3898 break;
3899 default:
3900 error_report("Unknown combination of migration flags: %#x"
3901 " (postcopy mode)", flags);
3902 ret = -EINVAL;
7a9ddfbf
PX
3903 break;
3904 }
3905
3906 /* Detect for any possible file errors */
3907 if (!ret && qemu_file_get_error(f)) {
3908 ret = qemu_file_get_error(f);
a7180877
DDAG
3909 }
3910
7a9ddfbf 3911 if (!ret && place_needed) {
a7180877 3912 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
3913 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3914
a7180877 3915 if (all_zero) {
df9ff5e1 3916 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3917 block);
a7180877 3918 } else {
df9ff5e1 3919 ret = postcopy_place_page(mis, place_dest,
8be4620b 3920 place_source, block);
a7180877
DDAG
3921 }
3922 }
a7180877
DDAG
3923 }
3924
3925 return ret;
3926}
3927
acab30b8
DHB
3928static bool postcopy_is_advised(void)
3929{
3930 PostcopyState ps = postcopy_state_get();
3931 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3932}
3933
3934static bool postcopy_is_running(void)
3935{
3936 PostcopyState ps = postcopy_state_get();
3937 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3938}
3939
56e93d26
JQ
3940static int ram_load(QEMUFile *f, void *opaque, int version_id)
3941{
edc60127 3942 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
3943 static uint64_t seq_iter;
3944 int len = 0;
a7180877
DDAG
3945 /*
3946 * If system is running in postcopy mode, page inserts to host memory must
3947 * be atomic
3948 */
acab30b8 3949 bool postcopy_running = postcopy_is_running();
ef08fb38 3950 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3951 bool postcopy_advised = postcopy_is_advised();
56e93d26
JQ
3952
3953 seq_iter++;
3954
3955 if (version_id != 4) {
3956 ret = -EINVAL;
3957 }
3958
edc60127
JQ
3959 if (!migrate_use_compression()) {
3960 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3961 }
56e93d26
JQ
3962 /* This RCU critical section can be very long running.
3963 * When RCU reclaims in the code start to become numerous,
3964 * it will be necessary to reduce the granularity of this
3965 * critical section.
3966 */
3967 rcu_read_lock();
a7180877
DDAG
3968
3969 if (postcopy_running) {
3970 ret = ram_load_postcopy(f);
3971 }
3972
3973 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3974 ram_addr_t addr, total_ram_bytes;
a776aa15 3975 void *host = NULL;
56e93d26
JQ
3976 uint8_t ch;
3977
3978 addr = qemu_get_be64(f);
3979 flags = addr & ~TARGET_PAGE_MASK;
3980 addr &= TARGET_PAGE_MASK;
3981
edc60127
JQ
3982 if (flags & invalid_flags) {
3983 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3984 error_report("Received an unexpected compressed page");
3985 }
3986
3987 ret = -EINVAL;
3988 break;
3989 }
3990
bb890ed5 3991 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3992 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3993 RAMBlock *block = ram_block_from_stream(f, flags);
3994
13af18f2
ZC
3995 /*
3996 * After going into COLO, we should load the Page into colo_cache.
3997 */
3998 if (migration_incoming_in_colo_state()) {
3999 host = colo_cache_from_block_offset(block, addr);
4000 } else {
4001 host = host_from_ram_block_offset(block, addr);
4002 }
a776aa15
DDAG
4003 if (!host) {
4004 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4005 ret = -EINVAL;
4006 break;
4007 }
13af18f2
ZC
4008
4009 if (!migration_incoming_in_colo_state()) {
4010 ramblock_recv_bitmap_set(block, host);
4011 }
4012
1db9d8e5 4013 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4014 }
4015
56e93d26
JQ
4016 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4017 case RAM_SAVE_FLAG_MEM_SIZE:
4018 /* Synchronize RAM block list */
4019 total_ram_bytes = addr;
4020 while (!ret && total_ram_bytes) {
4021 RAMBlock *block;
56e93d26
JQ
4022 char id[256];
4023 ram_addr_t length;
4024
4025 len = qemu_get_byte(f);
4026 qemu_get_buffer(f, (uint8_t *)id, len);
4027 id[len] = 0;
4028 length = qemu_get_be64(f);
4029
e3dd7493 4030 block = qemu_ram_block_by_name(id);
b895de50
CLG
4031 if (block && !qemu_ram_is_migratable(block)) {
4032 error_report("block %s should not be migrated !", id);
4033 ret = -EINVAL;
4034 } else if (block) {
e3dd7493
DDAG
4035 if (length != block->used_length) {
4036 Error *local_err = NULL;
56e93d26 4037
fa53a0e5 4038 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4039 &local_err);
4040 if (local_err) {
4041 error_report_err(local_err);
56e93d26 4042 }
56e93d26 4043 }
ef08fb38
DDAG
4044 /* For postcopy we need to check hugepage sizes match */
4045 if (postcopy_advised &&
4046 block->page_size != qemu_host_page_size) {
4047 uint64_t remote_page_size = qemu_get_be64(f);
4048 if (remote_page_size != block->page_size) {
4049 error_report("Mismatched RAM page size %s "
4050 "(local) %zd != %" PRId64,
4051 id, block->page_size,
4052 remote_page_size);
4053 ret = -EINVAL;
4054 }
4055 }
e3dd7493
DDAG
4056 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4057 block->idstr);
4058 } else {
56e93d26
JQ
4059 error_report("Unknown ramblock \"%s\", cannot "
4060 "accept migration", id);
4061 ret = -EINVAL;
4062 }
4063
4064 total_ram_bytes -= length;
4065 }
4066 break;
a776aa15 4067
bb890ed5 4068 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4069 ch = qemu_get_byte(f);
4070 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4071 break;
a776aa15 4072
56e93d26 4073 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4074 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4075 break;
56e93d26 4076
a776aa15 4077 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4078 len = qemu_get_be32(f);
4079 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4080 error_report("Invalid compressed data length: %d", len);
4081 ret = -EINVAL;
4082 break;
4083 }
c1bc6626 4084 decompress_data_with_multi_threads(f, host, len);
56e93d26 4085 break;
a776aa15 4086
56e93d26 4087 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4088 if (load_xbzrle(f, addr, host) < 0) {
4089 error_report("Failed to decompress XBZRLE page at "
4090 RAM_ADDR_FMT, addr);
4091 ret = -EINVAL;
4092 break;
4093 }
4094 break;
4095 case RAM_SAVE_FLAG_EOS:
4096 /* normal exit */
6df264ac 4097 multifd_recv_sync_main();
56e93d26
JQ
4098 break;
4099 default:
4100 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4101 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4102 } else {
4103 error_report("Unknown combination of migration flags: %#x",
4104 flags);
4105 ret = -EINVAL;
4106 }
4107 }
4108 if (!ret) {
4109 ret = qemu_file_get_error(f);
4110 }
4111 }
4112
34ab9e97 4113 ret |= wait_for_decompress_done();
56e93d26 4114 rcu_read_unlock();
55c4446b 4115 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
4116 return ret;
4117}
4118
c6467627
VSO
4119static bool ram_has_postcopy(void *opaque)
4120{
469dd51b
JH
4121 RAMBlock *rb;
4122 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
4123 if (ramblock_is_pmem(rb)) {
4124 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4125 "is not supported now!", rb->idstr, rb->host);
4126 return false;
4127 }
4128 }
4129
c6467627
VSO
4130 return migrate_postcopy_ram();
4131}
4132
edd090c7
PX
4133/* Sync all the dirty bitmap with destination VM. */
4134static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4135{
4136 RAMBlock *block;
4137 QEMUFile *file = s->to_dst_file;
4138 int ramblock_count = 0;
4139
4140 trace_ram_dirty_bitmap_sync_start();
4141
ff0769a4 4142 RAMBLOCK_FOREACH_MIGRATABLE(block) {
edd090c7
PX
4143 qemu_savevm_send_recv_bitmap(file, block->idstr);
4144 trace_ram_dirty_bitmap_request(block->idstr);
4145 ramblock_count++;
4146 }
4147
4148 trace_ram_dirty_bitmap_sync_wait();
4149
4150 /* Wait until all the ramblocks' dirty bitmap synced */
4151 while (ramblock_count--) {
4152 qemu_sem_wait(&s->rp_state.rp_sem);
4153 }
4154
4155 trace_ram_dirty_bitmap_sync_complete();
4156
4157 return 0;
4158}
4159
4160static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4161{
4162 qemu_sem_post(&s->rp_state.rp_sem);
4163}
4164
a335debb
PX
4165/*
4166 * Read the received bitmap, revert it as the initial dirty bitmap.
4167 * This is only used when the postcopy migration is paused but wants
4168 * to resume from a middle point.
4169 */
4170int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4171{
4172 int ret = -EINVAL;
4173 QEMUFile *file = s->rp_state.from_dst_file;
4174 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4175 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4176 uint64_t size, end_mark;
4177
4178 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4179
4180 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4181 error_report("%s: incorrect state %s", __func__,
4182 MigrationStatus_str(s->state));
4183 return -EINVAL;
4184 }
4185
4186 /*
4187 * Note: see comments in ramblock_recv_bitmap_send() on why we
4188 * need the endianess convertion, and the paddings.
4189 */
4190 local_size = ROUND_UP(local_size, 8);
4191
4192 /* Add paddings */
4193 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4194
4195 size = qemu_get_be64(file);
4196
4197 /* The size of the bitmap should match with our ramblock */
4198 if (size != local_size) {
4199 error_report("%s: ramblock '%s' bitmap size mismatch "
4200 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4201 block->idstr, size, local_size);
4202 ret = -EINVAL;
4203 goto out;
4204 }
4205
4206 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4207 end_mark = qemu_get_be64(file);
4208
4209 ret = qemu_file_get_error(file);
4210 if (ret || size != local_size) {
4211 error_report("%s: read bitmap failed for ramblock '%s': %d"
4212 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4213 __func__, block->idstr, ret, local_size, size);
4214 ret = -EIO;
4215 goto out;
4216 }
4217
4218 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4219 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4220 __func__, block->idstr, end_mark);
4221 ret = -EINVAL;
4222 goto out;
4223 }
4224
4225 /*
4226 * Endianess convertion. We are during postcopy (though paused).
4227 * The dirty bitmap won't change. We can directly modify it.
4228 */
4229 bitmap_from_le(block->bmap, le_bitmap, nbits);
4230
4231 /*
4232 * What we received is "received bitmap". Revert it as the initial
4233 * dirty bitmap for this ramblock.
4234 */
4235 bitmap_complement(block->bmap, block->bmap, nbits);
4236
4237 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4238
edd090c7
PX
4239 /*
4240 * We succeeded to sync bitmap for current ramblock. If this is
4241 * the last one to sync, we need to notify the main send thread.
4242 */
4243 ram_dirty_bitmap_reload_notify(s);
4244
a335debb
PX
4245 ret = 0;
4246out:
bf269906 4247 g_free(le_bitmap);
a335debb
PX
4248 return ret;
4249}
4250
edd090c7
PX
4251static int ram_resume_prepare(MigrationState *s, void *opaque)
4252{
4253 RAMState *rs = *(RAMState **)opaque;
08614f34 4254 int ret;
edd090c7 4255
08614f34
PX
4256 ret = ram_dirty_bitmap_sync_all(s, rs);
4257 if (ret) {
4258 return ret;
4259 }
4260
4261 ram_state_resume_prepare(rs, s->to_dst_file);
4262
4263 return 0;
edd090c7
PX
4264}
4265
56e93d26 4266static SaveVMHandlers savevm_ram_handlers = {
9907e842 4267 .save_setup = ram_save_setup,
56e93d26 4268 .save_live_iterate = ram_save_iterate,
763c906b 4269 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4270 .save_live_complete_precopy = ram_save_complete,
c6467627 4271 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4272 .save_live_pending = ram_save_pending,
4273 .load_state = ram_load,
f265e0e4
JQ
4274 .save_cleanup = ram_save_cleanup,
4275 .load_setup = ram_load_setup,
4276 .load_cleanup = ram_load_cleanup,
edd090c7 4277 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4278};
4279
4280void ram_mig_init(void)
4281{
4282 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 4283 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4284}