/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "qemu/pmem.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "socket.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
#include "sysemu/sysemu.h"
#include "qemu/uuid.h"
#include "savevm.h"
#include "qemu/iov.h"

/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

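/*
 * These flags are OR'ed into the low bits of the 64-bit page offset
 * that save_page_header() puts on the wire; offsets are target-page
 * aligned, so the low bits are free to carry per-page metadata.
 */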
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

/* Should be holding either ram_list.mutex, or the RCU lock. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)              \
    INTERNAL_RAMBLOCK_FOREACH(block)                    \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_MIGRATABLE(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
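/*
 * Illustrative sizes: for a 1 GiB RAMBlock with 4 KiB target pages,
 * nbits is 262144, so the stream carries an 8-byte size field (32768),
 * 32768 bytes of little-endian bitmap, and finally the 8-byte
 * RAMBLOCK_RECV_BITMAP_ENDING marker.
 */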
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that it works even when source and destination VMs
     * are not using the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

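/*
 * Body of each compression worker thread: wait until the migration
 * thread hands over a (block, offset) pair, compress that page into
 * the per-thread dummy QEMUFile, then mark the job done and signal
 * comp_done_cond so the migration thread can pick up the result.
 */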
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}

/* Multiple fd's */

#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

#define MULTIFD_FLAG_SYNC (1 << 0)

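/*
 * Initial handshake packet sent once per channel, before any data
 * packets: it carries the magic, the protocol version, the source
 * VM's UUID and the channel id, so the destination can validate the
 * connection and bind it to the right MultiFDRecvParams slot.
 */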
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;

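/*
 * Per-batch packet header. All multi-byte fields are big endian on the
 * wire: "size" is the channel's page capacity, "used" is how many
 * offsets actually follow, and "ramblock" names the block the offsets
 * belong to. The page contents themselves are written right after this
 * header as a separate iovec write (see multifd_send_thread()).
 */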
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    uint32_t size;
    uint32_t used;
    uint64_t packet_num;
    char ramblock[256];
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;

typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    RAMBlock *block;
} MultiFDPages_t;

typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDSendParams;

typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;

static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    msg.magic = cpu_to_be32(MULTIFD_MAGIC);
    msg.version = cpu_to_be32(MULTIFD_VERSION);
    msg.id = p->id;
    memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));

    ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }
    return 0;
}

static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }

    msg.magic = be32_to_cpu(msg.magic);
    msg.version = be32_to_cpu(msg.version);

    if (msg.magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x "
                   "expected %x", msg.magic, MULTIFD_MAGIC);
        return -1;
    }

    if (msg.version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet version %d "
                   "expected %d", msg.version, MULTIFD_VERSION);
        return -1;
    }

    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);

        error_setg(errp, "multifd: received uuid '%s' and expected "
                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
        g_free(uuid);
        g_free(msg_uuid);
        return -1;
    }

    if (msg.id > migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel id %d is greater than "
                   "number of channels %d", msg.id,
                   migrate_multifd_channels());
        return -1;
    }

    return msg.id;
}

static MultiFDPages_t *multifd_pages_init(size_t size)
{
    MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);

    pages->allocated = size;
    pages->iov = g_new0(struct iovec, size);
    pages->offset = g_new0(ram_addr_t, size);

    return pages;
}

static void multifd_pages_clear(MultiFDPages_t *pages)
{
    pages->used = 0;
    pages->allocated = 0;
    pages->packet_num = 0;
    pages->block = NULL;
    g_free(pages->iov);
    pages->iov = NULL;
    g_free(pages->offset);
    pages->offset = NULL;
    g_free(pages);
}

static void multifd_send_fill_packet(MultiFDSendParams *p)
{
    MultiFDPacket_t *packet = p->packet;
    int i;

    packet->magic = cpu_to_be32(MULTIFD_MAGIC);
    packet->version = cpu_to_be32(MULTIFD_VERSION);
    packet->flags = cpu_to_be32(p->flags);
    packet->size = cpu_to_be32(migrate_multifd_page_count());
    packet->used = cpu_to_be32(p->pages->used);
    packet->packet_num = cpu_to_be64(p->packet_num);

    if (p->pages->block) {
        strncpy(packet->ramblock, p->pages->block->idstr, 256);
    }

    for (i = 0; i < p->pages->used; i++) {
        packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
    }
}

static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
    MultiFDPacket_t *packet = p->packet;
    RAMBlock *block;
    int i;

    packet->magic = be32_to_cpu(packet->magic);
    if (packet->magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet "
                   "magic %x and expected magic %x",
                   packet->magic, MULTIFD_MAGIC);
        return -1;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet "
                   "version %d and expected version %d",
                   packet->version, MULTIFD_VERSION);
        return -1;
    }

    p->flags = be32_to_cpu(packet->flags);

    packet->size = be32_to_cpu(packet->size);
    if (packet->size > migrate_multifd_page_count()) {
        error_setg(errp, "multifd: received packet "
                   "with size %d and expected maximum size %d",
                   packet->size, migrate_multifd_page_count());
        return -1;
    }

    p->pages->used = be32_to_cpu(packet->used);
    if (p->pages->used > packet->size) {
        error_setg(errp, "multifd: received packet "
                   "with %d pages and expected maximum of %d pages",
                   p->pages->used, packet->size);
        return -1;
    }

    p->packet_num = be64_to_cpu(packet->packet_num);

    if (p->pages->used) {
        /* make sure that ramblock is 0 terminated */
        packet->ramblock[255] = 0;
        block = qemu_ram_block_by_name(packet->ramblock);
        if (!block) {
            error_setg(errp, "multifd: unknown ram block %s",
                       packet->ramblock);
            return -1;
        }
    }

    for (i = 0; i < p->pages->used; i++) {
        ram_addr_t offset = be64_to_cpu(packet->offset[i]);

        if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
                       " (max " RAM_ADDR_FMT ")",
                       offset, block->max_length);
            return -1;
        }
        p->pages->iov[i].iov_base = block->host + offset;
        p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
    }

    return 0;
}

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;

/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages array for each channel, and a main one. Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it. There are
 * two reasons for that:
 *    - to avoid having to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking. It belongs either to the migration
 * thread or to the channel thread. Switching is safe because the
 * migration thread holds the channel mutex when changing it, and the
 * channel thread has to have finished with its own copy, otherwise
 * pending_job can't be false.
 */

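/*
 * Pick the next idle channel (round robin from the last one used),
 * swap the global "pages" batch with that channel's, account the
 * transferred bytes and kick the channel thread to transmit it.
 */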
static void multifd_send_pages(void)
{
    int i;
    static int next_channel;
    MultiFDSendParams *p = NULL; /* make gcc happy */
    MultiFDPages_t *pages = multifd_send_state->pages;
    uint64_t transferred;

    qemu_sem_wait(&multifd_send_state->channels_ready);
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        if (!p->pending_job) {
            p->pending_job++;
            next_channel = (i + 1) % migrate_multifd_channels();
            break;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    p->pages->used = 0;

    p->packet_num = multifd_send_state->packet_num++;
    p->pages->block = NULL;
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
    ram_counters.multifd_bytes += transferred;
    ram_counters.transferred += transferred;
    qemu_mutex_unlock(&p->mutex);
    qemu_sem_post(&p->sem);
}

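/*
 * Queue one target page for multifd transmission. Pages are batched
 * per RAMBlock: the batch is flushed when it fills up, or when a page
 * from a different RAMBlock arrives, in which case the new page is
 * queued again into the fresh batch.
 */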
static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    if (!pages->block) {
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        if (pages->used < pages->allocated) {
            return;
        }
    }

    multifd_send_pages();

    if (pages->block != block) {
        multifd_queue_page(block, offset);
    }
}

static void multifd_send_terminate_threads(Error *err)
{
    int i;

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
            s->state == MIGRATION_STATUS_DEVICE ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

void multifd_save_cleanup(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    multifd_send_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        socket_send_channel_destroy(p->c);
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    qemu_sem_destroy(&multifd_send_state->channels_ready);
    qemu_sem_destroy(&multifd_send_state->sem_sync);
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    multifd_pages_clear(multifd_send_state->pages);
    multifd_send_state->pages = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
}

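/*
 * Flush any partially filled batch, then post a SYNC-flagged job to
 * every channel and wait on sem_sync until each of them has pushed its
 * packet out, so all channels are known to be drained at this point.
 */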
static void multifd_send_sync_main(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    if (multifd_send_state->pages->used) {
        multifd_send_pages();
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_signal(p->id);

        qemu_mutex_lock(&p->mutex);

        p->packet_num = multifd_send_state->packet_num++;
        p->flags |= MULTIFD_FLAG_SYNC;
        p->pending_job++;
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_post(&p->sem);
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_wait(p->id);
        qemu_sem_wait(&multifd_send_state->sem_sync);
    }
    trace_multifd_send_sync_main(multifd_send_state->packet_num);
}

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags);

            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                break;
            }

            ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
            if (ret != 0) {
                break;
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&multifd_send_state->sem_sync);
            }
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}

static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
{
    MultiFDSendParams *p = opaque;
    QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
    Error *local_err = NULL;

    if (qio_task_propagate_error(task, &local_err)) {
        migrate_set_error(migrate_get_current(), local_err);
        multifd_save_cleanup();
    } else {
        p->c = QIO_CHANNEL(sioc);
        qio_channel_set_delay(p->c, false);
        p->running = true;
        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                           QEMU_THREAD_JOINABLE);

        atomic_inc(&multifd_send_state->count);
    }
}

int multifd_save_setup(void)
{
    int thread_count;
    uint32_t page_count = migrate_multifd_page_count();
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
    atomic_set(&multifd_send_state->count, 0);
    multifd_send_state->pages = multifd_pages_init(page_count);
    qemu_sem_init(&multifd_send_state->sem_sync, 0);
    qemu_sem_init(&multifd_send_state->channels_ready, 0);

    for (i = 0; i < thread_count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        qemu_sem_init(&p->sem_sync, 0);
        p->quit = false;
        p->pending_job = 0;
        p->id = i;
        p->pages = multifd_pages_init(page_count);
        p->packet_len = sizeof(MultiFDPacket_t)
                      + sizeof(ram_addr_t) * page_count;
        p->packet = g_malloc0(p->packet_len);
        p->name = g_strdup_printf("multifdsend_%d", i);
        socket_send_channel_create(multifd_new_send_channel_async, p);
    }
    return 0;
}

struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;

static void multifd_recv_terminate_threads(Error *err)
{
    int i;

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_lock(&p->mutex);
        /* We could arrive here for two reasons:
           - normal quit, i.e. everything went fine, just finished
           - error quit: We close the channels so the channel threads
             finish the qio_channel_read_all_eof() */
        qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    multifd_recv_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        object_unref(OBJECT(p->c));
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    qemu_sem_destroy(&multifd_recv_state->sem_sync);
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}

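/*
 * Receive-side counterpart of multifd_send_sync_main(): wait until
 * every channel has seen a SYNC packet (each one posts sem_sync and
 * then blocks), record the highest packet number, and finally release
 * the channel threads again via their per-channel sem_sync.
 */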
static void multifd_recv_sync_main(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_wait(p->id);
        qemu_sem_wait(&multifd_recv_state->sem_sync);
        qemu_mutex_lock(&p->mutex);
        if (multifd_recv_state->packet_num < p->packet_num) {
            multifd_recv_state->packet_num = p->packet_num;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        trace_multifd_recv_sync_main_signal(p->id);
        qemu_sem_post(&p->sem_sync);
    }
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}

static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_recv_thread_start(p->id);
    rcu_register_thread();

    while (true) {
        uint32_t used;
        uint32_t flags;

        ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
                                       p->packet_len, &local_err);
        if (ret == 0) {   /* EOF */
            break;
        }
        if (ret == -1) {   /* Error */
            break;
        }

        qemu_mutex_lock(&p->mutex);
        ret = multifd_recv_unfill_packet(p, &local_err);
        if (ret) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }

        used = p->pages->used;
        flags = p->flags;
        trace_multifd_recv(p->id, p->packet_num, used, flags);
        p->num_packets++;
        p->num_pages += used;
        qemu_mutex_unlock(&p->mutex);

        ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
        if (ret != 0) {
            break;
        }

        if (flags & MULTIFD_FLAG_SYNC) {
            qemu_sem_post(&multifd_recv_state->sem_sync);
            qemu_sem_wait(&p->sem_sync);
        }
    }

    if (local_err) {
        multifd_recv_terminate_threads(local_err);
    }
    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}

int multifd_load_setup(void)
{
    int thread_count;
    uint32_t page_count = migrate_multifd_page_count();
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
    atomic_set(&multifd_recv_state->count, 0);
    qemu_sem_init(&multifd_recv_state->sem_sync, 0);

    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem_sync, 0);
        p->id = i;
        p->pages = multifd_pages_init(page_count);
        p->packet_len = sizeof(MultiFDPacket_t)
                      + sizeof(ram_addr_t) * page_count;
        p->packet = g_malloc0(p->packet_len);
        p->name = g_strdup_printf("multifdrecv_%d", i);
    }
    return 0;
}

bool multifd_recv_all_channels_created(void)
{
    int thread_count = migrate_multifd_channels();

    if (!migrate_use_multifd()) {
        return true;
    }

    return thread_count == atomic_read(&multifd_recv_state->count);
}

/*
 * Try to receive all multifd channels to get ready for the migration.
 * - Return true and do not set @errp when correctly receiving all channels;
 * - Return false and do not set @errp when correctly receiving the current one;
 * - Return false and set @errp when failing to receive the current channel.
 */
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    int id;

    id = multifd_recv_initial_packet(ioc, &local_err);
    if (id < 0) {
        multifd_recv_terminate_threads(local_err);
        error_propagate_prepend(errp, local_err,
                                "failed to receive packet"
                                " via multifd channel %d: ",
                                atomic_read(&multifd_recv_state->count));
        return false;
    }

    p = &multifd_recv_state->params[id];
    if (p->c != NULL) {
        error_setg(&local_err, "multifd: received id '%d' already set up",
                   id);
        multifd_recv_terminate_threads(local_err);
        error_propagate(errp, local_err);
        return false;
    }
    p->c = ioc;
    object_ref(OBJECT(ioc));
    /* initial packet */
    p->num_packets = 1;

    p->running = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    atomic_inc(&multifd_recv_state->count);
    return atomic_read(&multifd_recv_state->count) ==
           migrate_multifd_channels();
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    int pct_max = s->parameters.max_cpu_throttle;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
                             pct_max));
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

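/*
 * An XBZRLE page is sent as: the usual page header (with
 * RAM_SAVE_FLAG_XBZRLE set), a one-byte ENCODING_FLAG_XBZRLE marker,
 * a two-byte encoded length, and then the encoded delta itself; hence
 * the "encoded_len + 1 + 2" accounting below.
 */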
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the page number (within @rb) of the first dirty page at or
 * after @start, or the block's page count if there is none
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (!qemu_ram_is_migratable(rb)) {
        return size;
    }

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}

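/*
 * Pull the latest dirty log from the memory API into each RAMBlock's
 * migration bitmap, update the period counters and, at most once per
 * second, let auto-converge decide whether to start or increase CPU
 * throttling.
 */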
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

6c97ec5f
XG
1715/**
1716 * save_zero_page_to_file: send the zero page to the file
1717 *
1718 * Returns the size of data written to the file, 0 means the page is not
1719 * a zero page
1720 *
1721 * @rs: current RAM state
1722 * @file: the file where the data is saved
1723 * @block: block that contains the page we want to send
1724 * @offset: offset inside the block for the page
1725 */
1726static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1727 RAMBlock *block, ram_addr_t offset)
1728{
1729 uint8_t *p = block->host + offset;
1730 int len = 0;
1731
1732 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1733 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1734 qemu_put_byte(file, 0);
1735 len += 1;
1736 }
1737 return len;
1738}
1739
56e93d26 1740/**
3d0684b2 1741 * save_zero_page: send the zero page to the stream
56e93d26 1742 *
3d0684b2 1743 * Returns the number of pages written.
56e93d26 1744 *
f7ccd61b 1745 * @rs: current RAM state
56e93d26
JQ
1746 * @block: block that contains the page we want to send
1747 * @offset: offset inside the block for the page
56e93d26 1748 */
7faccdc3 1749static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1750{
6c97ec5f 1751 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1752
6c97ec5f 1753 if (len) {
9360447d 1754 ram_counters.duplicate++;
6c97ec5f
XG
1755 ram_counters.transferred += len;
1756 return 1;
56e93d26 1757 }
6c97ec5f 1758 return -1;
56e93d26
JQ
1759}
1760
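/*
 * Illustrative sketch, not part of the original file: what "zero page"
 * means here. demo_is_zero_page() is a hypothetical stand-in for
 * is_zero_range()/buffer_is_zero(); the real code uses the optimized
 * helper, not a byte loop.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool demo_is_zero_page(const uint8_t *page, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        if (page[i] != 0) {
            return false;
        }
    }
    return true;
}

/*
 * For such a page, save_zero_page_to_file() emits only the page header
 * (offset | RAM_SAVE_FLAG_ZERO, plus the block name unless
 * RAM_SAVE_FLAG_CONTINUE applies) followed by a single literal 0 byte,
 * instead of TARGET_PAGE_SIZE bytes of payload.
 */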
5727309d 1761static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1762{
5727309d 1763 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1764 return;
1765 }
1766
aaa2064c 1767 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1768}
1769
059ff0fb
XG
1770/*
1771 * @pages: the number of pages written by the control path,
1772 * < 0 - error
1773 * > 0 - number of pages written
1774 *
 1775 * Return true if the page has been saved, otherwise false is returned.
1776 */
1777static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1778 int *pages)
1779{
1780 uint64_t bytes_xmit = 0;
1781 int ret;
1782
1783 *pages = -1;
1784 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1785 &bytes_xmit);
1786 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1787 return false;
1788 }
1789
1790 if (bytes_xmit) {
1791 ram_counters.transferred += bytes_xmit;
1792 *pages = 1;
1793 }
1794
1795 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1796 return true;
1797 }
1798
1799 if (bytes_xmit > 0) {
1800 ram_counters.normal++;
1801 } else if (bytes_xmit == 0) {
1802 ram_counters.duplicate++;
1803 }
1804
1805 return true;
1806}
1807
65dacaa0
XG
1808/*
1809 * directly send the page to the stream
1810 *
1811 * Returns the number of pages written.
1812 *
1813 * @rs: current RAM state
1814 * @block: block that contains the page we want to send
1815 * @offset: offset inside the block for the page
1816 * @buf: the page to be sent
 1817 * @async: send the page asynchronously
1818 */
1819static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1820 uint8_t *buf, bool async)
1821{
1822 ram_counters.transferred += save_page_header(rs, rs->f, block,
1823 offset | RAM_SAVE_FLAG_PAGE);
1824 if (async) {
1825 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1826 migrate_release_ram() &
1827 migration_in_postcopy());
1828 } else {
1829 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1830 }
1831 ram_counters.transferred += TARGET_PAGE_SIZE;
1832 ram_counters.normal++;
1833 return 1;
1834}
1835
56e93d26 1836/**
3d0684b2 1837 * ram_save_page: send the given page to the stream
56e93d26 1838 *
3d0684b2 1839 * Returns the number of pages written.
3fd3c4b3
DDAG
1840 * < 0 - error
1841 * >=0 - Number of pages written - this might legally be 0
1842 * if xbzrle noticed the page was the same.
56e93d26 1843 *
6f37bb8b 1844 * @rs: current RAM state
56e93d26
JQ
1845 * @block: block that contains the page we want to send
1846 * @offset: offset inside the block for the page
1847 * @last_stage: if we are at the completion stage
56e93d26 1848 */
a0a8aa14 1849static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1850{
1851 int pages = -1;
56e93d26 1852 uint8_t *p;
56e93d26 1853 bool send_async = true;
a08f6890 1854 RAMBlock *block = pss->block;
a935e30f 1855 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 1856 ram_addr_t current_addr = block->offset + offset;
56e93d26 1857
2f68e399 1858 p = block->host + offset;
1db9d8e5 1859 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1860
56e93d26 1861 XBZRLE_cache_lock();
d7400a34
XG
1862 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1863 migrate_use_xbzrle()) {
059ff0fb
XG
1864 pages = save_xbzrle_page(rs, &p, current_addr, block,
1865 offset, last_stage);
1866 if (!last_stage) {
1867 /* Can't send this cached data async, since the cache page
1868 * might get updated before it gets to the wire
56e93d26 1869 */
059ff0fb 1870 send_async = false;
56e93d26
JQ
1871 }
1872 }
1873
1874 /* XBZRLE overflow or normal page */
1875 if (pages == -1) {
65dacaa0 1876 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1877 }
1878
1879 XBZRLE_cache_unlock();
1880
1881 return pages;
1882}
1883
b9ee2f7d
JQ
1884static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1885 ram_addr_t offset)
1886{
b9ee2f7d 1887 multifd_queue_page(block, offset);
b9ee2f7d
JQ
1888 ram_counters.normal++;
1889
1890 return 1;
1891}
1892
5e5fdcff 1893static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1894 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1895{
53518d94 1896 RAMState *rs = ram_state;
a7a9a88f 1897 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 1898 bool zero_page = false;
6ef3771c 1899 int ret;
56e93d26 1900
5e5fdcff
XG
1901 if (save_zero_page_to_file(rs, f, block, offset)) {
1902 zero_page = true;
1903 goto exit;
1904 }
1905
6ef3771c 1906 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1907
1908 /*
 1909 * copy it to an internal buffer to avoid it being modified by the VM,
 1910 * so that we can catch errors during compression and
1911 * decompression
1912 */
1913 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1914 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1915 if (ret < 0) {
1916 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1917 error_report("compressed data failed!");
5e5fdcff 1918 return false;
b3be2896 1919 }
56e93d26 1920
5e5fdcff 1921exit:
6ef3771c 1922 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
1923 return zero_page;
1924}
1925
1926static void
1927update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1928{
76e03000
XG
1929 ram_counters.transferred += bytes_xmit;
1930
5e5fdcff
XG
1931 if (param->zero_page) {
1932 ram_counters.duplicate++;
76e03000 1933 return;
5e5fdcff 1934 }
76e03000
XG
1935
1936 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1937 compression_counters.compressed_size += bytes_xmit - 8;
1938 compression_counters.pages++;
56e93d26
JQ
1939}
1940
32b05495
XG
1941static bool save_page_use_compression(RAMState *rs);
1942
ce25d337 1943static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1944{
1945 int idx, len, thread_count;
1946
32b05495 1947 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1948 return;
1949 }
1950 thread_count = migrate_compress_threads();
a7a9a88f 1951
0d9f9a5c 1952 qemu_mutex_lock(&comp_done_lock);
56e93d26 1953 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1954 while (!comp_param[idx].done) {
0d9f9a5c 1955 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1956 }
a7a9a88f 1957 }
0d9f9a5c 1958 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1959
1960 for (idx = 0; idx < thread_count; idx++) {
1961 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1962 if (!comp_param[idx].quit) {
ce25d337 1963 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1964 /*
1965 * it's safe to fetch zero_page without holding comp_done_lock
1966 * as there is no further request submitted to the thread,
 1967 * i.e., the thread should be waiting for a request at this point.
1968 */
1969 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1970 }
a7a9a88f 1971 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1972 }
1973}
1974
1975static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1976 ram_addr_t offset)
1977{
1978 param->block = block;
1979 param->offset = offset;
1980}
1981
ce25d337
JQ
1982static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1983 ram_addr_t offset)
56e93d26
JQ
1984{
1985 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1986 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1987
1988 thread_count = migrate_compress_threads();
0d9f9a5c 1989 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1990retry:
1991 for (idx = 0; idx < thread_count; idx++) {
1992 if (comp_param[idx].done) {
1993 comp_param[idx].done = false;
1994 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1995 qemu_mutex_lock(&comp_param[idx].mutex);
1996 set_compress_params(&comp_param[idx], block, offset);
1997 qemu_cond_signal(&comp_param[idx].cond);
1998 qemu_mutex_unlock(&comp_param[idx].mutex);
1999 pages = 1;
5e5fdcff 2000 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2001 break;
56e93d26
JQ
2002 }
2003 }
1d58872a
XG
2004
2005 /*
2006 * wait for the free thread if the user specifies 'compress-wait-thread',
 2007 * otherwise we will post the page out in the main thread as a normal page.
2008 */
2009 if (pages < 0 && wait) {
2010 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2011 goto retry;
2012 }
0d9f9a5c 2013 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2014
2015 return pages;
2016}
2017
3d0684b2
JQ
2018/**
2019 * find_dirty_block: find the next dirty page and update any state
2020 * associated with the search process.
b9e60928 2021 *
3d0684b2 2022 * Returns true if a dirty page was found
b9e60928 2023 *
6f37bb8b 2024 * @rs: current RAM state
3d0684b2
JQ
2025 * @pss: data about the state of the current dirty page scan
2026 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2027 */
f20e2865 2028static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2029{
f20e2865 2030 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2031 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2032 pss->page >= rs->last_page) {
b9e60928
DDAG
2033 /*
2034 * We've been once around the RAM and haven't found anything.
2035 * Give up.
2036 */
2037 *again = false;
2038 return false;
2039 }
a935e30f 2040 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2041 /* Didn't find anything in this RAM Block */
a935e30f 2042 pss->page = 0;
b9e60928
DDAG
2043 pss->block = QLIST_NEXT_RCU(pss->block, next);
2044 if (!pss->block) {
48df9d80
XG
2045 /*
2046 * If memory migration starts over, we will meet a dirtied page
 2047 * which may still exist in the compression threads' ring, so we
2048 * should flush the compressed data to make sure the new page
2049 * is not overwritten by the old one in the destination.
2050 *
 2051 * Also, if xbzrle is on, stop using data compression at this
2052 * point. In theory, xbzrle can do better than compression.
2053 */
2054 flush_compressed_data(rs);
2055
b9e60928
DDAG
2056 /* Hit the end of the list */
2057 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2058 /* Flag that we've looped */
2059 pss->complete_round = true;
6f37bb8b 2060 rs->ram_bulk_stage = false;
b9e60928
DDAG
2061 }
2062 /* Didn't find anything this time, but try again on the new block */
2063 *again = true;
2064 return false;
2065 } else {
2066 /* Can go around again, but... */
2067 *again = true;
2068 /* We've found something so probably don't need to */
2069 return true;
2070 }
2071}
2072
3d0684b2
JQ
2073/**
2074 * unqueue_page: gets a page of the queue
2075 *
a82d593b 2076 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2077 *
3d0684b2
JQ
2078 * Returns the block of the page (or NULL if none available)
2079 *
ec481c6c 2080 * @rs: current RAM state
3d0684b2 2081 * @offset: used to return the offset within the RAMBlock
a82d593b 2082 */
f20e2865 2083static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2084{
2085 RAMBlock *block = NULL;
2086
ae526e32
XG
2087 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2088 return NULL;
2089 }
2090
ec481c6c
JQ
2091 qemu_mutex_lock(&rs->src_page_req_mutex);
2092 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2093 struct RAMSrcPageRequest *entry =
2094 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2095 block = entry->rb;
2096 *offset = entry->offset;
a82d593b
DDAG
2097
2098 if (entry->len > TARGET_PAGE_SIZE) {
2099 entry->len -= TARGET_PAGE_SIZE;
2100 entry->offset += TARGET_PAGE_SIZE;
2101 } else {
2102 memory_region_unref(block->mr);
ec481c6c 2103 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2104 g_free(entry);
e03a34f8 2105 migration_consume_urgent_request();
a82d593b
DDAG
2106 }
2107 }
ec481c6c 2108 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2109
2110 return block;
2111}
2112
3d0684b2
JQ
2113/**
 2114 * get_queued_page: unqueue a page from the postcopy requests
2115 *
2116 * Skips pages that are already sent (!dirty)
a82d593b 2117 *
3d0684b2 2118 * Returns true if a queued page was found
a82d593b 2119 *
6f37bb8b 2120 * @rs: current RAM state
3d0684b2 2121 * @pss: data about the state of the current dirty page scan
a82d593b 2122 */
f20e2865 2123static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2124{
2125 RAMBlock *block;
2126 ram_addr_t offset;
2127 bool dirty;
2128
2129 do {
f20e2865 2130 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2131 /*
2132 * We're sending this page, and since it's postcopy nothing else
2133 * will dirty it, and we must make sure it doesn't get sent again
2134 * even if this queue request was received after the background
2135 * search already sent it.
2136 */
2137 if (block) {
f20e2865
JQ
2138 unsigned long page;
2139
6b6712ef
JQ
2140 page = offset >> TARGET_PAGE_BITS;
2141 dirty = test_bit(page, block->bmap);
a82d593b 2142 if (!dirty) {
06b10688 2143 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 2144 page, test_bit(page, block->unsentmap));
a82d593b 2145 } else {
f20e2865 2146 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2147 }
2148 }
2149
2150 } while (block && !dirty);
2151
2152 if (block) {
2153 /*
 2154 * As soon as we start servicing pages out of order, we have to
 2155 * kill the bulk stage, since the bulk stage assumes (in
 2156 * migration_bitmap_find_and_reset_dirty) that every page is
 2157 * dirty, and that is no longer true.
2158 */
6f37bb8b 2159 rs->ram_bulk_stage = false;
a82d593b
DDAG
2160
2161 /*
2162 * We want the background search to continue from the queued page
2163 * since the guest is likely to want other pages near to the page
2164 * it just requested.
2165 */
2166 pss->block = block;
a935e30f 2167 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
2168 }
2169
2170 return !!block;
2171}
2172
6c595cde 2173/**
5e58f968
JQ
2174 * migration_page_queue_free: drop any remaining pages in the ram
2175 * request queue
6c595cde 2176 *
3d0684b2
JQ
2177 * It should be empty at the end anyway, but in error cases there may
 2178 * be some left. In case any page is left, we drop it.
2179 *
6c595cde 2180 */
83c13382 2181static void migration_page_queue_free(RAMState *rs)
6c595cde 2182{
ec481c6c 2183 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2184 /* This queue generally should be empty - but in the case of a failed
 2185 * migration it might have some droppings in it.
2186 */
2187 rcu_read_lock();
ec481c6c 2188 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2189 memory_region_unref(mspr->rb->mr);
ec481c6c 2190 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2191 g_free(mspr);
2192 }
2193 rcu_read_unlock();
2194}
2195
2196/**
3d0684b2
JQ
2197 * ram_save_queue_pages: queue the page for transmission
2198 *
2199 * A request from postcopy destination for example.
2200 *
2201 * Returns zero on success or negative on error
2202 *
3d0684b2
JQ
 2203 * @rbname: Name of the RAMBlock of the request. NULL means the
 2204 * same as the last one.
2205 * @start: starting address from the start of the RAMBlock
2206 * @len: length (in bytes) to send
6c595cde 2207 */
96506894 2208int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2209{
2210 RAMBlock *ramblock;
53518d94 2211 RAMState *rs = ram_state;
6c595cde 2212
9360447d 2213 ram_counters.postcopy_requests++;
6c595cde
DDAG
2214 rcu_read_lock();
2215 if (!rbname) {
2216 /* Reuse last RAMBlock */
68a098f3 2217 ramblock = rs->last_req_rb;
6c595cde
DDAG
2218
2219 if (!ramblock) {
2220 /*
2221 * Shouldn't happen, we can't reuse the last RAMBlock if
2222 * it's the 1st request.
2223 */
2224 error_report("ram_save_queue_pages no previous block");
2225 goto err;
2226 }
2227 } else {
2228 ramblock = qemu_ram_block_by_name(rbname);
2229
2230 if (!ramblock) {
2231 /* We shouldn't be asked for a non-existent RAMBlock */
2232 error_report("ram_save_queue_pages no block '%s'", rbname);
2233 goto err;
2234 }
68a098f3 2235 rs->last_req_rb = ramblock;
6c595cde
DDAG
2236 }
2237 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2238 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2239 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2240 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
2241 __func__, start, len, ramblock->used_length);
2242 goto err;
2243 }
2244
ec481c6c
JQ
2245 struct RAMSrcPageRequest *new_entry =
2246 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2247 new_entry->rb = ramblock;
2248 new_entry->offset = start;
2249 new_entry->len = len;
2250
2251 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2252 qemu_mutex_lock(&rs->src_page_req_mutex);
2253 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2254 migration_make_urgent_request();
ec481c6c 2255 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2256 rcu_read_unlock();
2257
2258 return 0;
2259
2260err:
2261 rcu_read_unlock();
2262 return -1;
2263}
2264
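/*
 * Illustrative usage, not part of the original file: a postcopy page
 * request arriving from the destination for a range of the "pc.ram" block
 * could be queued on the source with a call shaped like this. The block
 * name, offset and length below are made up for the example.
 */
static void demo_queue_urgent_range(void)
{
    /* Ask for one 4 KiB page at offset 0x200000 of "pc.ram". */
    if (ram_save_queue_pages("pc.ram", 0x200000, 4096) < 0) {
        /* Handle the failure; the page will not be prioritized. */
    }
}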
d7400a34
XG
2265static bool save_page_use_compression(RAMState *rs)
2266{
2267 if (!migrate_use_compression()) {
2268 return false;
2269 }
2270
2271 /*
2272 * If xbzrle is on, stop using the data compression after first
2273 * round of migration even if compression is enabled. In theory,
2274 * xbzrle can do better than compression.
2275 */
2276 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2277 return true;
2278 }
2279
2280 return false;
2281}
2282
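/*
 * Decision summary for the helper above (descriptive only, not part of the
 * original file):
 *
 *   compression enabled | xbzrle enabled | bulk stage | compress this page?
 *   --------------------+----------------+------------+--------------------
 *   no                  | any            | any        | no
 *   yes                 | no             | any        | yes
 *   yes                 | yes            | yes        | yes
 *   yes                 | yes            | no         | no (xbzrle wins)
 */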
5e5fdcff
XG
2283/*
2284 * try to compress the page before posting it out, return true if the page
2285 * has been properly handled by compression, otherwise needs other
2286 * paths to handle it
2287 */
2288static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2289{
2290 if (!save_page_use_compression(rs)) {
2291 return false;
2292 }
2293
2294 /*
2295 * When starting the process of a new block, the first page of
2296 * the block should be sent out before other pages in the same
 2297 * block, and all the pages in the last block should have been sent
 2298 * out. Keeping this order is important, because the 'cont' flag
2299 * is used to avoid resending the block name.
2300 *
 2301 * We post the first page as a normal page because compression
 2302 * takes a lot of CPU resources.
2303 */
2304 if (block != rs->last_sent_block) {
2305 flush_compressed_data(rs);
2306 return false;
2307 }
2308
2309 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2310 return true;
2311 }
2312
76e03000 2313 compression_counters.busy++;
5e5fdcff
XG
2314 return false;
2315}
2316
a82d593b 2317/**
3d0684b2 2318 * ram_save_target_page: save one target page
a82d593b 2319 *
3d0684b2 2320 * Returns the number of pages written
a82d593b 2321 *
6f37bb8b 2322 * @rs: current RAM state
3d0684b2 2323 * @pss: data about the page we want to send
a82d593b 2324 * @last_stage: if we are at the completion stage
a82d593b 2325 */
a0a8aa14 2326static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2327 bool last_stage)
a82d593b 2328{
a8ec91f9
XG
2329 RAMBlock *block = pss->block;
2330 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2331 int res;
2332
2333 if (control_save_page(rs, block, offset, &res)) {
2334 return res;
2335 }
2336
5e5fdcff
XG
2337 if (save_compress_page(rs, block, offset)) {
2338 return 1;
d7400a34
XG
2339 }
2340
2341 res = save_zero_page(rs, block, offset);
2342 if (res > 0) {
2343 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2344 * page would be stale
2345 */
2346 if (!save_page_use_compression(rs)) {
2347 XBZRLE_cache_lock();
2348 xbzrle_cache_zero_page(rs, block->offset + offset);
2349 XBZRLE_cache_unlock();
2350 }
2351 ram_release_pages(block->idstr, offset, res);
2352 return res;
2353 }
2354
da3f56cb 2355 /*
5e5fdcff
XG
2356 * do not use multifd for compression as the first page in the new
2357 * block should be posted out before sending the compressed page
da3f56cb 2358 */
5e5fdcff 2359 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2360 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2361 }
2362
1faa5665 2363 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2364}
2365
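/*
 * Summary of the dispatch order implemented above (descriptive only, not
 * part of the original file):
 *
 *   1. control_save_page()     - the control/RDMA path gets first refusal
 *   2. save_compress_page()    - threaded compression, only for pages in the
 *                                block currently being sent
 *   3. save_zero_page()        - a header plus one byte on the wire
 *   4. ram_save_multifd_page() - multifd, when compression is not in use
 *   5. ram_save_page()         - plain page, optionally via xbzrle
 */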
2366/**
3d0684b2 2367 * ram_save_host_page: save a whole host page
a82d593b 2368 *
3d0684b2
JQ
 2369 * Starting at *offset, send pages up to the end of the current host
2370 * page. It's valid for the initial offset to point into the middle of
2371 * a host page in which case the remainder of the hostpage is sent.
2372 * Only dirty target pages are sent. Note that the host page size may
2373 * be a huge page for this block.
1eb3fc0a
DDAG
2374 * The saving stops at the boundary of the used_length of the block
2375 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2376 *
3d0684b2
JQ
2377 * Returns the number of pages written or negative on error
2378 *
6f37bb8b 2379 * @rs: current RAM state
3d0684b2 2380 * @ms: current migration state
3d0684b2 2381 * @pss: data about the page we want to send
a82d593b 2382 * @last_stage: if we are at the completion stage
a82d593b 2383 */
a0a8aa14 2384static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2385 bool last_stage)
a82d593b
DDAG
2386{
2387 int tmppages, pages = 0;
a935e30f
JQ
2388 size_t pagesize_bits =
2389 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2390
b895de50
CLG
2391 if (!qemu_ram_is_migratable(pss->block)) {
2392 error_report("block %s should not be migrated !", pss->block->idstr);
2393 return 0;
2394 }
2395
a82d593b 2396 do {
1faa5665
XG
 2397 /* Check if the page is dirty and, if it is, send it */
2398 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2399 pss->page++;
2400 continue;
2401 }
2402
f20e2865 2403 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2404 if (tmppages < 0) {
2405 return tmppages;
2406 }
2407
2408 pages += tmppages;
1faa5665
XG
2409 if (pss->block->unsentmap) {
2410 clear_bit(pss->page, pss->block->unsentmap);
2411 }
2412
a935e30f 2413 pss->page++;
1eb3fc0a
DDAG
2414 } while ((pss->page & (pagesize_bits - 1)) &&
2415 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2416
2417 /* The offset we leave with is the last one we looked at */
a935e30f 2418 pss->page--;
a82d593b
DDAG
2419 return pages;
2420}
6c595cde 2421
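/*
 * Worked example, not part of the original file, assuming 2 MiB hugepages
 * and 4 KiB target pages: pagesize_bits == 512, so the loop above keeps
 * sending dirty 4 KiB pages until pss->page reaches the next multiple of
 * 512 (or the end of the block's used_length). A host page is therefore
 * always migrated as a unit, which is what postcopy relies on.
 */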
56e93d26 2422/**
3d0684b2 2423 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2424 *
2425 * Called within an RCU critical section.
2426 *
e8f3735f
XG
2427 * Returns the number of pages written where zero means no dirty pages,
2428 * or negative on error
56e93d26 2429 *
6f37bb8b 2430 * @rs: current RAM state
56e93d26 2431 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2432 *
2433 * On systems where host-page-size > target-page-size it will send all the
2434 * pages in a host page that are dirty.
56e93d26
JQ
2435 */
2436
ce25d337 2437static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2438{
b8fb8cb7 2439 PageSearchStatus pss;
56e93d26 2440 int pages = 0;
b9e60928 2441 bool again, found;
56e93d26 2442
0827b9e9
AA
2443 /* No dirty page as there is zero RAM */
2444 if (!ram_bytes_total()) {
2445 return pages;
2446 }
2447
6f37bb8b 2448 pss.block = rs->last_seen_block;
a935e30f 2449 pss.page = rs->last_page;
b8fb8cb7
DDAG
2450 pss.complete_round = false;
2451
2452 if (!pss.block) {
2453 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2454 }
56e93d26 2455
b9e60928 2456 do {
a82d593b 2457 again = true;
f20e2865 2458 found = get_queued_page(rs, &pss);
b9e60928 2459
a82d593b
DDAG
2460 if (!found) {
2461 /* priority queue empty, so just search for something dirty */
f20e2865 2462 found = find_dirty_block(rs, &pss, &again);
a82d593b 2463 }
f3f491fc 2464
a82d593b 2465 if (found) {
f20e2865 2466 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2467 }
b9e60928 2468 } while (!pages && again);
56e93d26 2469
6f37bb8b 2470 rs->last_seen_block = pss.block;
a935e30f 2471 rs->last_page = pss.page;
56e93d26
JQ
2472
2473 return pages;
2474}
2475
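/*
 * Search order implemented above (descriptive only, not part of the
 * original file): postcopy priority requests are drained first via
 * get_queued_page(); only when that queue is empty does find_dirty_block()
 * continue the linear scan, resuming from rs->last_seen_block /
 * rs->last_page so that repeated calls walk RAM round-robin.
 */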
2476void acct_update_position(QEMUFile *f, size_t size, bool zero)
2477{
2478 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2479
56e93d26 2480 if (zero) {
9360447d 2481 ram_counters.duplicate += pages;
56e93d26 2482 } else {
9360447d
JQ
2483 ram_counters.normal += pages;
2484 ram_counters.transferred += size;
56e93d26
JQ
2485 qemu_update_position(f, size);
2486 }
2487}
2488
56e93d26
JQ
2489uint64_t ram_bytes_total(void)
2490{
2491 RAMBlock *block;
2492 uint64_t total = 0;
2493
2494 rcu_read_lock();
b895de50 2495 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26 2496 total += block->used_length;
99e15582 2497 }
56e93d26
JQ
2498 rcu_read_unlock();
2499 return total;
2500}
2501
f265e0e4 2502static void xbzrle_load_setup(void)
56e93d26 2503{
f265e0e4 2504 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2505}
2506
f265e0e4
JQ
2507static void xbzrle_load_cleanup(void)
2508{
2509 g_free(XBZRLE.decoded_buf);
2510 XBZRLE.decoded_buf = NULL;
2511}
2512
7d7c96be
PX
2513static void ram_state_cleanup(RAMState **rsp)
2514{
b9ccaf6d
DDAG
2515 if (*rsp) {
2516 migration_page_queue_free(*rsp);
2517 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2518 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2519 g_free(*rsp);
2520 *rsp = NULL;
2521 }
7d7c96be
PX
2522}
2523
84593a08
PX
2524static void xbzrle_cleanup(void)
2525{
2526 XBZRLE_cache_lock();
2527 if (XBZRLE.cache) {
2528 cache_fini(XBZRLE.cache);
2529 g_free(XBZRLE.encoded_buf);
2530 g_free(XBZRLE.current_buf);
2531 g_free(XBZRLE.zero_target_page);
2532 XBZRLE.cache = NULL;
2533 XBZRLE.encoded_buf = NULL;
2534 XBZRLE.current_buf = NULL;
2535 XBZRLE.zero_target_page = NULL;
2536 }
2537 XBZRLE_cache_unlock();
2538}
2539
f265e0e4 2540static void ram_save_cleanup(void *opaque)
56e93d26 2541{
53518d94 2542 RAMState **rsp = opaque;
6b6712ef 2543 RAMBlock *block;
eb859c53 2544
2ff64038
LZ
 2545 /* The caller must hold the iothread lock or be in a bottom half, so there
 2546 * is no write race against this migration_bitmap
2547 */
6b6712ef
JQ
2548 memory_global_dirty_log_stop();
2549
b895de50 2550 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2551 g_free(block->bmap);
2552 block->bmap = NULL;
2553 g_free(block->unsentmap);
2554 block->unsentmap = NULL;
56e93d26
JQ
2555 }
2556
84593a08 2557 xbzrle_cleanup();
f0afa331 2558 compress_threads_save_cleanup();
7d7c96be 2559 ram_state_cleanup(rsp);
56e93d26
JQ
2560}
2561
6f37bb8b 2562static void ram_state_reset(RAMState *rs)
56e93d26 2563{
6f37bb8b
JQ
2564 rs->last_seen_block = NULL;
2565 rs->last_sent_block = NULL;
269ace29 2566 rs->last_page = 0;
6f37bb8b
JQ
2567 rs->last_version = ram_list.version;
2568 rs->ram_bulk_stage = true;
56e93d26
JQ
2569}
2570
2571#define MAX_WAIT 50 /* ms, half buffered_file limit */
2572
4f2e4252
DDAG
2573/*
2574 * 'expected' is the value you expect the bitmap mostly to be full
2575 * of; it won't bother printing lines that are all this value.
2576 * If 'todump' is null the migration bitmap is dumped.
2577 */
6b6712ef
JQ
2578void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2579 unsigned long pages)
4f2e4252 2580{
4f2e4252
DDAG
2581 int64_t cur;
2582 int64_t linelen = 128;
2583 char linebuf[129];
2584
6b6712ef 2585 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2586 int64_t curb;
2587 bool found = false;
2588 /*
2589 * Last line; catch the case where the line length
2590 * is longer than remaining ram
2591 */
6b6712ef
JQ
2592 if (cur + linelen > pages) {
2593 linelen = pages - cur;
4f2e4252
DDAG
2594 }
2595 for (curb = 0; curb < linelen; curb++) {
2596 bool thisbit = test_bit(cur + curb, todump);
2597 linebuf[curb] = thisbit ? '1' : '.';
2598 found = found || (thisbit != expected);
2599 }
2600 if (found) {
2601 linebuf[curb] = '\0';
2602 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2603 }
2604 }
2605}
2606
e0b266f0
DDAG
2607/* **** functions for postcopy ***** */
2608
ced1c616
PB
2609void ram_postcopy_migrated_memory_release(MigrationState *ms)
2610{
2611 struct RAMBlock *block;
ced1c616 2612
b895de50 2613 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2614 unsigned long *bitmap = block->bmap;
2615 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2616 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2617
2618 while (run_start < range) {
2619 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2620 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2621 (run_end - run_start) << TARGET_PAGE_BITS);
2622 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2623 }
2624 }
2625}
2626
3d0684b2
JQ
2627/**
2628 * postcopy_send_discard_bm_ram: discard a RAMBlock
2629 *
2630 * Returns zero on success
2631 *
e0b266f0
DDAG
2632 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2633 * Note: At this point the 'unsentmap' is the processed bitmap combined
2634 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
2635 *
2636 * @ms: current migration state
2637 * @pds: state for postcopy
 2638 * @block: RAMBlock whose unsent ranges should be discarded
e0b266f0
DDAG
2640 */
2641static int postcopy_send_discard_bm_ram(MigrationState *ms,
2642 PostcopyDiscardState *pds,
6b6712ef 2643 RAMBlock *block)
e0b266f0 2644{
6b6712ef 2645 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2646 unsigned long current;
6b6712ef 2647 unsigned long *unsentmap = block->unsentmap;
e0b266f0 2648
6b6712ef 2649 for (current = 0; current < end; ) {
e0b266f0
DDAG
2650 unsigned long one = find_next_bit(unsentmap, end, current);
2651
2652 if (one <= end) {
2653 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2654 unsigned long discard_length;
2655
2656 if (zero >= end) {
2657 discard_length = end - one;
2658 } else {
2659 discard_length = zero - one;
2660 }
d688c62d
DDAG
2661 if (discard_length) {
2662 postcopy_discard_send_range(ms, pds, one, discard_length);
2663 }
e0b266f0
DDAG
2664 current = one + discard_length;
2665 } else {
2666 current = one;
2667 }
2668 }
2669
2670 return 0;
2671}
2672
3d0684b2
JQ
2673/**
2674 * postcopy_each_ram_send_discard: discard all RAMBlocks
2675 *
2676 * Returns 0 for success or negative for error
2677 *
e0b266f0
DDAG
2678 * Utility for the outgoing postcopy code.
2679 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2680 * passing it bitmap indexes and name.
e0b266f0
DDAG
2681 * (qemu_ram_foreach_block ends up passing unscaled lengths
2682 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2683 *
2684 * @ms: current migration state
e0b266f0
DDAG
2685 */
2686static int postcopy_each_ram_send_discard(MigrationState *ms)
2687{
2688 struct RAMBlock *block;
2689 int ret;
2690
b895de50 2691 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2692 PostcopyDiscardState *pds =
2693 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2694
2695 /*
2696 * Postcopy sends chunks of bitmap over the wire, but it
2697 * just needs indexes at this point, avoids it having
2698 * target page specific code.
2699 */
6b6712ef 2700 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
2701 postcopy_discard_send_finish(ms, pds);
2702 if (ret) {
2703 return ret;
2704 }
2705 }
2706
2707 return 0;
2708}
2709
3d0684b2
JQ
2710/**
 2711 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2712 *
2713 * Helper for postcopy_chunk_hostpages; it's called twice to
2714 * canonicalize the two bitmaps, that are similar, but one is
2715 * inverted.
99e314eb 2716 *
3d0684b2
JQ
2717 * Postcopy requires that all target pages in a hostpage are dirty or
2718 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2719 *
3d0684b2
JQ
2720 * @ms: current migration state
2721 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2722 * otherwise we need to canonicalize partially dirty host pages
2723 * @block: block that contains the page we want to canonicalize
2724 * @pds: state for postcopy
99e314eb
DDAG
2725 */
2726static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2727 RAMBlock *block,
2728 PostcopyDiscardState *pds)
2729{
53518d94 2730 RAMState *rs = ram_state;
6b6712ef
JQ
2731 unsigned long *bitmap = block->bmap;
2732 unsigned long *unsentmap = block->unsentmap;
29c59172 2733 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2734 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2735 unsigned long run_start;
2736
29c59172
DDAG
2737 if (block->page_size == TARGET_PAGE_SIZE) {
2738 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2739 return;
2740 }
2741
99e314eb
DDAG
2742 if (unsent_pass) {
2743 /* Find a sent page */
6b6712ef 2744 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
2745 } else {
2746 /* Find a dirty page */
6b6712ef 2747 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
2748 }
2749
6b6712ef 2750 while (run_start < pages) {
99e314eb
DDAG
2751 bool do_fixup = false;
2752 unsigned long fixup_start_addr;
2753 unsigned long host_offset;
2754
2755 /*
2756 * If the start of this run of pages is in the middle of a host
2757 * page, then we need to fixup this host page.
2758 */
2759 host_offset = run_start % host_ratio;
2760 if (host_offset) {
2761 do_fixup = true;
2762 run_start -= host_offset;
2763 fixup_start_addr = run_start;
2764 /* For the next pass */
2765 run_start = run_start + host_ratio;
2766 } else {
2767 /* Find the end of this run */
2768 unsigned long run_end;
2769 if (unsent_pass) {
6b6712ef 2770 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 2771 } else {
6b6712ef 2772 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2773 }
2774 /*
2775 * If the end isn't at the start of a host page, then the
2776 * run doesn't finish at the end of a host page
2777 * and we need to discard.
2778 */
2779 host_offset = run_end % host_ratio;
2780 if (host_offset) {
2781 do_fixup = true;
2782 fixup_start_addr = run_end - host_offset;
2783 /*
2784 * This host page has gone, the next loop iteration starts
2785 * from after the fixup
2786 */
2787 run_start = fixup_start_addr + host_ratio;
2788 } else {
2789 /*
2790 * No discards on this iteration, next loop starts from
2791 * next sent/dirty page
2792 */
2793 run_start = run_end + 1;
2794 }
2795 }
2796
2797 if (do_fixup) {
2798 unsigned long page;
2799
2800 /* Tell the destination to discard this page */
2801 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2802 /* For the unsent_pass we:
2803 * discard partially sent pages
2804 * For the !unsent_pass (dirty) we:
2805 * discard partially dirty pages that were sent
2806 * (any partially sent pages were already discarded
2807 * by the previous unsent_pass)
2808 */
2809 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2810 host_ratio);
2811 }
2812
2813 /* Clean up the bitmap */
2814 for (page = fixup_start_addr;
2815 page < fixup_start_addr + host_ratio; page++) {
2816 /* All pages in this host page are now not sent */
2817 set_bit(page, unsentmap);
2818
2819 /*
2820 * Remark them as dirty, updating the count for any pages
2821 * that weren't previously dirty.
2822 */
0d8ec885 2823 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2824 }
2825 }
2826
2827 if (unsent_pass) {
2828 /* Find the next sent page for the next iteration */
6b6712ef 2829 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
2830 } else {
2831 /* Find the next dirty page for the next iteration */
6b6712ef 2832 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2833 }
2834 }
2835}
2836
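/*
 * Worked example, not part of the original file, of the fixup arithmetic
 * above (assuming 2 MiB hugepages, i.e. host_ratio == 512 target pages):
 * if a run of dirty pages starts at target-page index 1000, then
 *
 *   host_offset      = 1000 % 512 = 488   (mid host page -> needs fixup)
 *   fixup_start_addr = 1000 - 488 = 512   (start of that host page)
 *
 * and all 512 target pages of that host page are discarded and re-marked
 * dirty, so the destination never keeps a partially populated hugepage.
 */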
3d0684b2
JQ
2837/**
 2838 * postcopy_chunk_hostpages: discard any partially sent host page
2839 *
99e314eb
DDAG
2840 * Utility for the outgoing postcopy code.
2841 *
2842 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2843 * dirty host-page size chunks as all dirty. In this case the host-page
2844 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2845 *
3d0684b2
JQ
2846 * Returns zero on success
2847 *
2848 * @ms: current migration state
6b6712ef 2849 * @block: block we want to work with
99e314eb 2850 */
6b6712ef 2851static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2852{
6b6712ef
JQ
2853 PostcopyDiscardState *pds =
2854 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2855
6b6712ef
JQ
2856 /* First pass: Discard all partially sent host pages */
2857 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2858 /*
2859 * Second pass: Ensure that all partially dirty host pages are made
2860 * fully dirty.
2861 */
2862 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 2863
6b6712ef 2864 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
2865 return 0;
2866}
2867
3d0684b2
JQ
2868/**
2869 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2870 *
2871 * Returns zero on success
2872 *
e0b266f0
DDAG
 2873 * Transmit the set of pages to be discarded after precopy to the target.
 2874 * These are pages that:
2875 * a) Have been previously transmitted but are now dirty again
2876 * b) Pages that have never been transmitted, this ensures that
2877 * any pages on the destination that have been mapped by background
2878 * tasks get discarded (transparent huge pages is the specific concern)
2879 * Hopefully this is pretty sparse
3d0684b2
JQ
2880 *
2881 * @ms: current migration state
e0b266f0
DDAG
2882 */
2883int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2884{
53518d94 2885 RAMState *rs = ram_state;
6b6712ef 2886 RAMBlock *block;
e0b266f0 2887 int ret;
e0b266f0
DDAG
2888
2889 rcu_read_lock();
2890
2891 /* This should be our last sync, the src is now paused */
eb859c53 2892 migration_bitmap_sync(rs);
e0b266f0 2893
6b6712ef
JQ
2894 /* Easiest way to make sure we don't resume in the middle of a host-page */
2895 rs->last_seen_block = NULL;
2896 rs->last_sent_block = NULL;
2897 rs->last_page = 0;
e0b266f0 2898
b895de50 2899 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2900 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2901 unsigned long *bitmap = block->bmap;
2902 unsigned long *unsentmap = block->unsentmap;
2903
2904 if (!unsentmap) {
2905 /* We don't have a safe way to resize the sentmap, so
2906 * if the bitmap was resized it will be NULL at this
2907 * point.
2908 */
2909 error_report("migration ram resized during precopy phase");
2910 rcu_read_unlock();
2911 return -EINVAL;
2912 }
2913 /* Deal with TPS != HPS and huge pages */
2914 ret = postcopy_chunk_hostpages(ms, block);
2915 if (ret) {
2916 rcu_read_unlock();
2917 return ret;
2918 }
e0b266f0 2919
6b6712ef
JQ
2920 /*
2921 * Update the unsentmap to be unsentmap = unsentmap | dirty
2922 */
2923 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 2924#ifdef DEBUG_POSTCOPY
6b6712ef 2925 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 2926#endif
6b6712ef
JQ
2927 }
2928 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2929
2930 ret = postcopy_each_ram_send_discard(ms);
2931 rcu_read_unlock();
2932
2933 return ret;
2934}
2935
3d0684b2
JQ
2936/**
2937 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2938 *
3d0684b2 2939 * Returns zero on success
e0b266f0 2940 *
36449157
JQ
2941 * @rbname: name of the RAMBlock of the request. NULL means the
 2942 * same as the last one.
3d0684b2
JQ
2943 * @start: RAMBlock starting page
2944 * @length: RAMBlock size
e0b266f0 2945 */
aaa2064c 2946int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
2947{
2948 int ret = -1;
2949
36449157 2950 trace_ram_discard_range(rbname, start, length);
d3a5038c 2951
e0b266f0 2952 rcu_read_lock();
36449157 2953 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2954
2955 if (!rb) {
36449157 2956 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
2957 goto err;
2958 }
2959
814bb08f
PX
2960 /*
2961 * On source VM, we don't need to update the received bitmap since
2962 * we don't even have one.
2963 */
2964 if (rb->receivedmap) {
2965 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2966 length >> qemu_target_page_bits());
2967 }
2968
d3a5038c 2969 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2970
2971err:
2972 rcu_read_unlock();
2973
2974 return ret;
2975}
2976
84593a08
PX
2977/*
2978 * For every allocation, we will try not to crash the VM if the
2979 * allocation failed.
2980 */
2981static int xbzrle_init(void)
2982{
2983 Error *local_err = NULL;
2984
2985 if (!migrate_use_xbzrle()) {
2986 return 0;
2987 }
2988
2989 XBZRLE_cache_lock();
2990
2991 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2992 if (!XBZRLE.zero_target_page) {
2993 error_report("%s: Error allocating zero page", __func__);
2994 goto err_out;
2995 }
2996
2997 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2998 TARGET_PAGE_SIZE, &local_err);
2999 if (!XBZRLE.cache) {
3000 error_report_err(local_err);
3001 goto free_zero_page;
3002 }
3003
3004 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3005 if (!XBZRLE.encoded_buf) {
3006 error_report("%s: Error allocating encoded_buf", __func__);
3007 goto free_cache;
3008 }
3009
3010 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3011 if (!XBZRLE.current_buf) {
3012 error_report("%s: Error allocating current_buf", __func__);
3013 goto free_encoded_buf;
3014 }
3015
3016 /* We are all good */
3017 XBZRLE_cache_unlock();
3018 return 0;
3019
3020free_encoded_buf:
3021 g_free(XBZRLE.encoded_buf);
3022 XBZRLE.encoded_buf = NULL;
3023free_cache:
3024 cache_fini(XBZRLE.cache);
3025 XBZRLE.cache = NULL;
3026free_zero_page:
3027 g_free(XBZRLE.zero_target_page);
3028 XBZRLE.zero_target_page = NULL;
3029err_out:
3030 XBZRLE_cache_unlock();
3031 return -ENOMEM;
3032}
3033
53518d94 3034static int ram_state_init(RAMState **rsp)
56e93d26 3035{
7d00ee6a
PX
3036 *rsp = g_try_new0(RAMState, 1);
3037
3038 if (!*rsp) {
3039 error_report("%s: Init ramstate fail", __func__);
3040 return -1;
3041 }
53518d94
JQ
3042
3043 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3044 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3045 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3046
7d00ee6a
PX
3047 /*
3048 * Count the total number of pages used by ram blocks not including any
3049 * gaps due to alignment or unplugs.
3050 */
3051 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3052
3053 ram_state_reset(*rsp);
3054
3055 return 0;
3056}
3057
d6eff5d7 3058static void ram_list_init_bitmaps(void)
7d00ee6a 3059{
d6eff5d7
PX
3060 RAMBlock *block;
3061 unsigned long pages;
56e93d26 3062
0827b9e9
AA
3063 /* Skip setting bitmap if there is no RAM */
3064 if (ram_bytes_total()) {
b895de50 3065 RAMBLOCK_FOREACH_MIGRATABLE(block) {
d6eff5d7 3066 pages = block->max_length >> TARGET_PAGE_BITS;
6b6712ef
JQ
3067 block->bmap = bitmap_new(pages);
3068 bitmap_set(block->bmap, 0, pages);
3069 if (migrate_postcopy_ram()) {
3070 block->unsentmap = bitmap_new(pages);
3071 bitmap_set(block->unsentmap, 0, pages);
3072 }
0827b9e9 3073 }
f3f491fc 3074 }
d6eff5d7
PX
3075}
3076
3077static void ram_init_bitmaps(RAMState *rs)
3078{
3079 /* For memory_global_dirty_log_start below. */
3080 qemu_mutex_lock_iothread();
3081 qemu_mutex_lock_ramlist();
3082 rcu_read_lock();
f3f491fc 3083
d6eff5d7 3084 ram_list_init_bitmaps();
56e93d26 3085 memory_global_dirty_log_start();
d6eff5d7
PX
3086 migration_bitmap_sync(rs);
3087
3088 rcu_read_unlock();
56e93d26 3089 qemu_mutex_unlock_ramlist();
49877834 3090 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3091}
3092
3093static int ram_init_all(RAMState **rsp)
3094{
3095 if (ram_state_init(rsp)) {
3096 return -1;
3097 }
3098
3099 if (xbzrle_init()) {
3100 ram_state_cleanup(rsp);
3101 return -1;
3102 }
3103
3104 ram_init_bitmaps(*rsp);
a91246c9
HZ
3105
3106 return 0;
3107}
3108
08614f34
PX
3109static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3110{
3111 RAMBlock *block;
3112 uint64_t pages = 0;
3113
3114 /*
3115 * Postcopy is not using xbzrle/compression, so no need for that.
 3116 * Also, since the source is already halted, we don't need to care
 3117 * about dirty page logging either.
3118 */
3119
ff0769a4 3120 RAMBLOCK_FOREACH_MIGRATABLE(block) {
08614f34
PX
3121 pages += bitmap_count_one(block->bmap,
3122 block->used_length >> TARGET_PAGE_BITS);
3123 }
3124
3125 /* This may not be aligned with current bitmaps. Recalculate. */
3126 rs->migration_dirty_pages = pages;
3127
3128 rs->last_seen_block = NULL;
3129 rs->last_sent_block = NULL;
3130 rs->last_page = 0;
3131 rs->last_version = ram_list.version;
3132 /*
3133 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3134 * matter what we have sent.
3135 */
3136 rs->ram_bulk_stage = false;
3137
3138 /* Update RAMState cache of output QEMUFile */
3139 rs->f = out;
3140
3141 trace_ram_state_resume_prepare(pages);
3142}
3143
3d0684b2
JQ
3144/*
3145 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
 3146 * a long-running RCU critical section. When RCU reclaims in the code
3147 * start to become numerous it will be necessary to reduce the
3148 * granularity of these critical sections.
3149 */
3150
3d0684b2
JQ
3151/**
3152 * ram_save_setup: Setup RAM for migration
3153 *
3154 * Returns zero to indicate success and negative for error
3155 *
3156 * @f: QEMUFile where to send the data
3157 * @opaque: RAMState pointer
3158 */
a91246c9
HZ
3159static int ram_save_setup(QEMUFile *f, void *opaque)
3160{
53518d94 3161 RAMState **rsp = opaque;
a91246c9
HZ
3162 RAMBlock *block;
3163
dcaf446e
XG
3164 if (compress_threads_save_setup()) {
3165 return -1;
3166 }
3167
a91246c9
HZ
3168 /* migration has already setup the bitmap, reuse it. */
3169 if (!migration_in_colo_state()) {
7d00ee6a 3170 if (ram_init_all(rsp) != 0) {
dcaf446e 3171 compress_threads_save_cleanup();
a91246c9 3172 return -1;
53518d94 3173 }
a91246c9 3174 }
53518d94 3175 (*rsp)->f = f;
a91246c9
HZ
3176
3177 rcu_read_lock();
56e93d26
JQ
3178
3179 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3180
b895de50 3181 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26
JQ
3182 qemu_put_byte(f, strlen(block->idstr));
3183 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3184 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
3185 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3186 qemu_put_be64(f, block->page_size);
3187 }
56e93d26
JQ
3188 }
3189
3190 rcu_read_unlock();
3191
3192 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3193 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3194
6df264ac 3195 multifd_send_sync_main();
56e93d26 3196 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3197 qemu_fflush(f);
56e93d26
JQ
3198
3199 return 0;
3200}
3201
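/*
 * Stream layout produced by ram_save_setup() above (descriptive only, not
 * part of the original file):
 *
 *   be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *       byte:  strlen(idstr)
 *       bytes: idstr (not NUL-terminated on the wire)
 *       be64:  used_length
 *       be64:  page_size   (only if postcopy and page_size != host page size)
 *   be64: RAM_SAVE_FLAG_EOS   (after a multifd sync)
 */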
3d0684b2
JQ
3202/**
3203 * ram_save_iterate: iterative stage for migration
3204 *
3205 * Returns zero to indicate success and negative for error
3206 *
3207 * @f: QEMUFile where to send the data
3208 * @opaque: RAMState pointer
3209 */
56e93d26
JQ
3210static int ram_save_iterate(QEMUFile *f, void *opaque)
3211{
53518d94
JQ
3212 RAMState **temp = opaque;
3213 RAMState *rs = *temp;
56e93d26
JQ
3214 int ret;
3215 int i;
3216 int64_t t0;
5c90308f 3217 int done = 0;
56e93d26 3218
b2557345
PL
3219 if (blk_mig_bulk_active()) {
3220 /* Avoid transferring ram during bulk phase of block migration as
3221 * the bulk phase will usually take a long time and transferring
3222 * ram updates during that time is pointless. */
3223 goto out;
3224 }
3225
56e93d26 3226 rcu_read_lock();
6f37bb8b
JQ
3227 if (ram_list.version != rs->last_version) {
3228 ram_state_reset(rs);
56e93d26
JQ
3229 }
3230
3231 /* Read version before ram_list.blocks */
3232 smp_rmb();
3233
3234 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3235
3236 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3237 i = 0;
e03a34f8
DDAG
3238 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3239 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
56e93d26
JQ
3240 int pages;
3241
e03a34f8
DDAG
3242 if (qemu_file_get_error(f)) {
3243 break;
3244 }
3245
ce25d337 3246 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
 3247 /* no more pages to send */
3248 if (pages == 0) {
5c90308f 3249 done = 1;
56e93d26
JQ
3250 break;
3251 }
e8f3735f
XG
3252
3253 if (pages < 0) {
3254 qemu_file_set_error(f, pages);
3255 break;
3256 }
3257
be8b02ed 3258 rs->target_page_count += pages;
070afca2 3259
56e93d26
JQ
3260 /* we want to check in the 1st loop, just in case it was the 1st time
3261 and we had to sync the dirty bitmap.
 3262 qemu_get_clock_ns() is a bit expensive, so we only check every few
 3263 iterations
3264 */
3265 if ((i & 63) == 0) {
3266 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3267 if (t1 > MAX_WAIT) {
55c4446b 3268 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
3269 break;
3270 }
3271 }
3272 i++;
3273 }
56e93d26
JQ
3274 rcu_read_unlock();
3275
3276 /*
3277 * Must occur before EOS (or any QEMUFile operation)
3278 * because of RDMA protocol.
3279 */
3280 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3281
6df264ac 3282 multifd_send_sync_main();
b2557345 3283out:
56e93d26 3284 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3285 qemu_fflush(f);
9360447d 3286 ram_counters.transferred += 8;
56e93d26
JQ
3287
3288 ret = qemu_file_get_error(f);
3289 if (ret < 0) {
3290 return ret;
3291 }
3292
5c90308f 3293 return done;
56e93d26
JQ
3294}
3295
3d0684b2
JQ
3296/**
3297 * ram_save_complete: function called to send the remaining amount of ram
3298 *
e8f3735f 3299 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3300 *
3301 * Called with iothread lock
3302 *
3303 * @f: QEMUFile where to send the data
3304 * @opaque: RAMState pointer
3305 */
56e93d26
JQ
3306static int ram_save_complete(QEMUFile *f, void *opaque)
3307{
53518d94
JQ
3308 RAMState **temp = opaque;
3309 RAMState *rs = *temp;
e8f3735f 3310 int ret = 0;
6f37bb8b 3311
56e93d26
JQ
3312 rcu_read_lock();
3313
5727309d 3314 if (!migration_in_postcopy()) {
8d820d6f 3315 migration_bitmap_sync(rs);
663e6c1d 3316 }
56e93d26
JQ
3317
3318 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3319
3320 /* try transferring iterative blocks of memory */
3321
3322 /* flush all remaining blocks regardless of rate limiting */
3323 while (true) {
3324 int pages;
3325
ce25d337 3326 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
 3327 /* no more blocks to send */
3328 if (pages == 0) {
3329 break;
3330 }
e8f3735f
XG
3331 if (pages < 0) {
3332 ret = pages;
3333 break;
3334 }
56e93d26
JQ
3335 }
3336
ce25d337 3337 flush_compressed_data(rs);
56e93d26 3338 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
3339
3340 rcu_read_unlock();
d09a6fde 3341
6df264ac 3342 multifd_send_sync_main();
56e93d26 3343 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3344 qemu_fflush(f);
56e93d26 3345
e8f3735f 3346 return ret;
56e93d26
JQ
3347}
3348
c31b098f 3349static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3350 uint64_t *res_precopy_only,
3351 uint64_t *res_compatible,
3352 uint64_t *res_postcopy_only)
56e93d26 3353{
53518d94
JQ
3354 RAMState **temp = opaque;
3355 RAMState *rs = *temp;
56e93d26
JQ
3356 uint64_t remaining_size;
3357
9edabd4d 3358 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3359
5727309d 3360 if (!migration_in_postcopy() &&
663e6c1d 3361 remaining_size < max_size) {
56e93d26
JQ
3362 qemu_mutex_lock_iothread();
3363 rcu_read_lock();
8d820d6f 3364 migration_bitmap_sync(rs);
56e93d26
JQ
3365 rcu_read_unlock();
3366 qemu_mutex_unlock_iothread();
9edabd4d 3367 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3368 }
c31b098f 3369
86e1167e
VSO
3370 if (migrate_postcopy_ram()) {
3371 /* We can do postcopy, and all the data is postcopiable */
47995026 3372 *res_compatible += remaining_size;
86e1167e 3373 } else {
47995026 3374 *res_precopy_only += remaining_size;
86e1167e 3375 }
56e93d26
JQ
3376}
3377
3378static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3379{
3380 unsigned int xh_len;
3381 int xh_flags;
063e760a 3382 uint8_t *loaded_data;
56e93d26 3383
56e93d26
JQ
3384 /* extract RLE header */
3385 xh_flags = qemu_get_byte(f);
3386 xh_len = qemu_get_be16(f);
3387
3388 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3389 error_report("Failed to load XBZRLE page - wrong compression!");
3390 return -1;
3391 }
3392
3393 if (xh_len > TARGET_PAGE_SIZE) {
3394 error_report("Failed to load XBZRLE page - len overflow!");
3395 return -1;
3396 }
f265e0e4 3397 loaded_data = XBZRLE.decoded_buf;
56e93d26 3398 /* load data and decode */
f265e0e4 3399 /* it can change loaded_data to point to an internal buffer */
063e760a 3400 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3401
3402 /* decode RLE */
063e760a 3403 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3404 TARGET_PAGE_SIZE) == -1) {
3405 error_report("Failed to load XBZRLE page - decode error!");
3406 return -1;
3407 }
3408
3409 return 0;
3410}
3411
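/*
 * Wire layout consumed by load_xbzrle() above (descriptive only, not part
 * of the original file):
 *
 *   +--------+--------------+--------------------+
 *   | 1 byte | 2 bytes (BE) | xh_len bytes       |
 *   | flags  | xh_len       | RLE-encoded delta  |
 *   +--------+--------------+--------------------+
 *
 * flags must be ENCODING_FLAG_XBZRLE and xh_len may not exceed
 * TARGET_PAGE_SIZE; the delta is then applied on top of the existing
 * contents of 'host'.
 */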
3d0684b2
JQ
3412/**
3413 * ram_block_from_stream: read a RAMBlock id from the migration stream
3414 *
3415 * Must be called from within a rcu critical section.
3416 *
56e93d26 3417 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3418 *
3d0684b2
JQ
3419 * @f: QEMUFile where to read the data from
3420 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3421 */
3d0684b2 3422static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3423{
3424 static RAMBlock *block = NULL;
3425 char id[256];
3426 uint8_t len;
3427
3428 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3429 if (!block) {
56e93d26
JQ
3430 error_report("Ack, bad migration stream!");
3431 return NULL;
3432 }
4c4bad48 3433 return block;
56e93d26
JQ
3434 }
3435
3436 len = qemu_get_byte(f);
3437 qemu_get_buffer(f, (uint8_t *)id, len);
3438 id[len] = 0;
3439
e3dd7493 3440 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3441 if (!block) {
3442 error_report("Can't find block %s", id);
3443 return NULL;
56e93d26
JQ
3444 }
3445
b895de50
CLG
3446 if (!qemu_ram_is_migratable(block)) {
3447 error_report("block %s should not be migrated !", id);
3448 return NULL;
3449 }
3450
4c4bad48
HZ
3451 return block;
3452}
3453
3454static inline void *host_from_ram_block_offset(RAMBlock *block,
3455 ram_addr_t offset)
3456{
3457 if (!offset_in_ramblock(block, offset)) {
3458 return NULL;
3459 }
3460
3461 return block->host + offset;
56e93d26
JQ
3462}
3463
13af18f2
ZC
3464static inline void *colo_cache_from_block_offset(RAMBlock *block,
3465 ram_addr_t offset)
3466{
3467 if (!offset_in_ramblock(block, offset)) {
3468 return NULL;
3469 }
3470 if (!block->colo_cache) {
3471 error_report("%s: colo_cache is NULL in block :%s",
3472 __func__, block->idstr);
3473 return NULL;
3474 }
7d9acafa
ZC
3475
3476 /*
 3477 * During a COLO checkpoint, we need a bitmap of these migrated pages.
 3478 * It helps us decide which pages in the ram cache should be flushed
 3479 * into the VM's RAM later.
3480 */
3481 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3482 ram_state->migration_dirty_pages++;
3483 }
13af18f2
ZC
3484 return block->colo_cache + offset;
3485}
3486
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: the byte the page is filled with; only zero is supported
 * @size: size of the zero page
 */
3497void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3498{
3499 if (ch != 0 || !is_zero_range(host, size)) {
3500 memset(host, ch, size);
3501 }
3502}
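/*
 * Illustrative usage, not part of the original file: a zero page arrives as
 * RAM_SAVE_FLAG_ZERO plus a single fill byte, and the memset() above is
 * skipped when the destination already holds zeroes, which avoids dirtying
 * (and, for sparse mappings, allocating) pages needlessly. This mirrors the
 * receive path in ram_load() further below.
 */
#if 0   /* example only, never compiled */
static void example_handle_zero_page(QEMUFile *f, void *host)
{
    uint8_t ch = qemu_get_byte(f);

    ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
}
#endif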
3503
797ca154
XG
3504/* return the size after decompression, or negative value on error */
3505static int
3506qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3507 const uint8_t *source, size_t source_len)
3508{
3509 int err;
3510
3511 err = inflateReset(stream);
3512 if (err != Z_OK) {
3513 return -1;
3514 }
3515
3516 stream->avail_in = source_len;
3517 stream->next_in = (uint8_t *)source;
3518 stream->avail_out = dest_len;
3519 stream->next_out = dest;
3520
3521 err = inflate(stream, Z_NO_FLUSH);
3522 if (err != Z_STREAM_END) {
3523 return -1;
3524 }
3525
3526 return stream->total_out;
3527}
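/*
 * Illustrative sketch, not part of the original file: the payload undone by
 * qemu_uncompress_data() is produced with zlib's deflate API on the sending
 * side (the real compression path lives elsewhere in this file). A minimal
 * counterpart, assuming 'stream' was set up once with deflateInit():
 */
#if 0   /* example only, never compiled */
static int example_compress_page(z_stream *stream, uint8_t *dest,
                                 size_t dest_len, const uint8_t *source,
                                 size_t source_len)
{
    if (deflateReset(stream) != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    if (deflate(stream, Z_FINISH) != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}
#endif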
3528
56e93d26
JQ
3529static void *do_data_decompress(void *opaque)
3530{
3531 DecompressParam *param = opaque;
3532 unsigned long pagesize;
33d151f4 3533 uint8_t *des;
34ab9e97 3534 int len, ret;
56e93d26 3535
33d151f4 3536 qemu_mutex_lock(&param->mutex);
90e56fb4 3537 while (!param->quit) {
3538 if (param->des) {
3539 des = param->des;
3540 len = param->len;
3541 param->des = 0;
3542 qemu_mutex_unlock(&param->mutex);
3543
56e93d26 3544 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3545
3546 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3547 param->compbuf, len);
f548222c 3548 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3549 error_report("decompress data failed");
3550 qemu_file_set_error(decomp_file, ret);
3551 }
73a8912b 3552
33d151f4
LL
3553 qemu_mutex_lock(&decomp_done_lock);
3554 param->done = true;
3555 qemu_cond_signal(&decomp_done_cond);
3556 qemu_mutex_unlock(&decomp_done_lock);
3557
3558 qemu_mutex_lock(&param->mutex);
3559 } else {
3560 qemu_cond_wait(&param->cond, &param->mutex);
3561 }
56e93d26 3562 }
33d151f4 3563 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3564
3565 return NULL;
3566}
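/*
 * Summary of the handshake above, derived from the code: the feeder thread
 * fills param->des/param->len under param->mutex and signals param->cond;
 * the worker clears param->des, drops param->mutex while inflating, then
 * marks param->done under decomp_done_lock and signals decomp_done_cond so
 * decompress_data_with_multi_threads() can hand it the next page.
 * param->quit asks the worker to exit.
 */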
3567
34ab9e97 3568static int wait_for_decompress_done(void)
3569{
3570 int idx, thread_count;
3571
3572 if (!migrate_use_compression()) {
34ab9e97 3573 return 0;
5533b2e9
LL
3574 }
3575
3576 thread_count = migrate_decompress_threads();
3577 qemu_mutex_lock(&decomp_done_lock);
3578 for (idx = 0; idx < thread_count; idx++) {
3579 while (!decomp_param[idx].done) {
3580 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3581 }
3582 }
3583 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3584 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3585}
3586
f0afa331 3587static void compress_threads_load_cleanup(void)
3588{
3589 int i, thread_count;
3590
3416ab5b
JQ
3591 if (!migrate_use_compression()) {
3592 return;
3593 }
56e93d26
JQ
3594 thread_count = migrate_decompress_threads();
3595 for (i = 0; i < thread_count; i++) {
        /*
         * We use it as an indicator of whether the thread was
         * properly initialized.
         */
3600 if (!decomp_param[i].compbuf) {
3601 break;
3602 }
3603
56e93d26 3604 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3605 decomp_param[i].quit = true;
56e93d26
JQ
3606 qemu_cond_signal(&decomp_param[i].cond);
3607 qemu_mutex_unlock(&decomp_param[i].mutex);
3608 }
3609 for (i = 0; i < thread_count; i++) {
3610 if (!decomp_param[i].compbuf) {
3611 break;
3612 }
3613
56e93d26
JQ
3614 qemu_thread_join(decompress_threads + i);
3615 qemu_mutex_destroy(&decomp_param[i].mutex);
3616 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3617 inflateEnd(&decomp_param[i].stream);
56e93d26 3618 g_free(decomp_param[i].compbuf);
797ca154 3619 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3620 }
3621 g_free(decompress_threads);
3622 g_free(decomp_param);
56e93d26
JQ
3623 decompress_threads = NULL;
3624 decomp_param = NULL;
34ab9e97 3625 decomp_file = NULL;
56e93d26
JQ
3626}
3627
34ab9e97 3628static int compress_threads_load_setup(QEMUFile *f)
3629{
3630 int i, thread_count;
3631
3632 if (!migrate_use_compression()) {
3633 return 0;
3634 }
3635
3636 thread_count = migrate_decompress_threads();
3637 decompress_threads = g_new0(QemuThread, thread_count);
3638 decomp_param = g_new0(DecompressParam, thread_count);
3639 qemu_mutex_init(&decomp_done_lock);
3640 qemu_cond_init(&decomp_done_cond);
34ab9e97 3641 decomp_file = f;
797ca154
XG
3642 for (i = 0; i < thread_count; i++) {
3643 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3644 goto exit;
3645 }
3646
3647 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3648 qemu_mutex_init(&decomp_param[i].mutex);
3649 qemu_cond_init(&decomp_param[i].cond);
3650 decomp_param[i].done = true;
3651 decomp_param[i].quit = false;
3652 qemu_thread_create(decompress_threads + i, "decompress",
3653 do_data_decompress, decomp_param + i,
3654 QEMU_THREAD_JOINABLE);
3655 }
3656 return 0;
3657exit:
3658 compress_threads_load_cleanup();
3659 return -1;
3660}
3661
c1bc6626 3662static void decompress_data_with_multi_threads(QEMUFile *f,
3663 void *host, int len)
3664{
3665 int idx, thread_count;
3666
3667 thread_count = migrate_decompress_threads();
73a8912b 3668 qemu_mutex_lock(&decomp_done_lock);
3669 while (true) {
3670 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3671 if (decomp_param[idx].done) {
3672 decomp_param[idx].done = false;
3673 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3674 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3675 decomp_param[idx].des = host;
3676 decomp_param[idx].len = len;
33d151f4
LL
3677 qemu_cond_signal(&decomp_param[idx].cond);
3678 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3679 break;
3680 }
3681 }
3682 if (idx < thread_count) {
3683 break;
73a8912b
LL
3684 } else {
3685 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3686 }
3687 }
73a8912b 3688 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3689}
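/*
 * Note, derived from the code above: decomp_done_lock is held across the
 * scan, so the loop provides natural backpressure - when every worker is
 * busy the caller sleeps on decomp_done_cond until one of them finishes,
 * instead of reading further pages off the stream.
 */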
3690
/*
 * COLO cache: this is for the secondary VM. We cache the whole memory of
 * the secondary VM; the global lock must be held when calling this helper.
 */
3696int colo_init_ram_cache(void)
3697{
3698 RAMBlock *block;
3699
3700 rcu_read_lock();
7d9acafa 3701 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3702 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3703 NULL,
3704 false);
3705 if (!block->colo_cache) {
3706 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3707 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3708 block->used_length);
3709 goto out_locked;
3710 }
3711 memcpy(block->colo_cache, block->host, block->used_length);
3712 }
3713 rcu_read_unlock();
    /*
     * Record the dirty pages sent by the PVM; this dirty bitmap is used to
     * decide which pages in the cache should be flushed into the SVM's RAM
     * later. We reuse the same name 'ram_bitmap' as for migration.
     */
3719 if (ram_bytes_total()) {
3720 RAMBlock *block;
3721
3722 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3723 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3724
3725 block->bmap = bitmap_new(pages);
3726 bitmap_set(block->bmap, 0, pages);
3727 }
3728 }
3729 ram_state = g_new0(RAMState, 1);
3730 ram_state->migration_dirty_pages = 0;
d1955d22 3731 memory_global_dirty_log_start();
7d9acafa 3732
13af18f2
ZC
3733 return 0;
3734
3735out_locked:
7d9acafa
ZC
3736
3737 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3738 if (block->colo_cache) {
3739 qemu_anon_ram_free(block->colo_cache, block->used_length);
3740 block->colo_cache = NULL;
3741 }
3742 }
3743
3744 rcu_read_unlock();
3745 return -errno;
3746}
3747
/* The global lock must be held when calling this helper */
3749void colo_release_ram_cache(void)
3750{
3751 RAMBlock *block;
3752
d1955d22 3753 memory_global_dirty_log_stop();
7d9acafa
ZC
3754 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3755 g_free(block->bmap);
3756 block->bmap = NULL;
3757 }
3758
13af18f2 3759 rcu_read_lock();
7d9acafa
ZC
3760
3761 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3762 if (block->colo_cache) {
3763 qemu_anon_ram_free(block->colo_cache, block->used_length);
3764 block->colo_cache = NULL;
3765 }
3766 }
7d9acafa 3767
13af18f2 3768 rcu_read_unlock();
7d9acafa
ZC
3769 g_free(ram_state);
3770 ram_state = NULL;
13af18f2
ZC
3771}
3772
f265e0e4
JQ
3773/**
3774 * ram_load_setup: Setup RAM for migration incoming side
3775 *
3776 * Returns zero to indicate success and negative for error
3777 *
3778 * @f: QEMUFile where to receive the data
3779 * @opaque: RAMState pointer
3780 */
3781static int ram_load_setup(QEMUFile *f, void *opaque)
3782{
34ab9e97 3783 if (compress_threads_load_setup(f)) {
3784 return -1;
3785 }
3786
f265e0e4 3787 xbzrle_load_setup();
f9494614 3788 ramblock_recv_map_init();
13af18f2 3789
f265e0e4
JQ
3790 return 0;
3791}
3792
3793static int ram_load_cleanup(void *opaque)
3794{
f9494614 3795 RAMBlock *rb;
56eb90af
JH
3796
3797 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3798 if (ramblock_is_pmem(rb)) {
3799 pmem_persist(rb->host, rb->used_length);
3800 }
3801 }
3802
f265e0e4 3803 xbzrle_load_cleanup();
f0afa331 3804 compress_threads_load_cleanup();
f9494614 3805
b895de50 3806 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3807 g_free(rb->receivedmap);
3808 rb->receivedmap = NULL;
3809 }
13af18f2 3810
f265e0e4
JQ
3811 return 0;
3812}
3813
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate the data structures etc. needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
3825int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3826{
c136180c 3827 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3828}
3829
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the data from
 */
3840static int ram_load_postcopy(QEMUFile *f)
3841{
3842 int flags = 0, ret = 0;
3843 bool place_needed = false;
1aa83678 3844 bool matches_target_page_size = false;
a7180877
DDAG
3845 MigrationIncomingState *mis = migration_incoming_get_current();
3846 /* Temporary page that is later 'placed' */
3847 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 3848 void *last_host = NULL;
a3b6ff6d 3849 bool all_zero = false;
a7180877
DDAG
3850
3851 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3852 ram_addr_t addr;
3853 void *host = NULL;
3854 void *page_buffer = NULL;
3855 void *place_source = NULL;
df9ff5e1 3856 RAMBlock *block = NULL;
a7180877 3857 uint8_t ch;
a7180877
DDAG
3858
3859 addr = qemu_get_be64(f);
7a9ddfbf
PX
3860
3861 /*
3862 * If qemu file error, we should stop here, and then "addr"
3863 * may be invalid
3864 */
3865 ret = qemu_file_get_error(f);
3866 if (ret) {
3867 break;
3868 }
3869
a7180877
DDAG
3870 flags = addr & ~TARGET_PAGE_MASK;
3871 addr &= TARGET_PAGE_MASK;
3872
3873 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3874 place_needed = false;
bb890ed5 3875 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 3876 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3877
3878 host = host_from_ram_block_offset(block, addr);
3879 if (!host) {
3880 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3881 ret = -EINVAL;
3882 break;
3883 }
1aa83678 3884 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the
             * components of a host page in order.
             */
3895 page_buffer = postcopy_host_page +
28abd200 3896 ((uintptr_t)host & (block->page_size - 1));
a7180877 3897 /* If all TP are zero then we can optimise the place */
28abd200 3898 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 3899 all_zero = true;
c53b7ddc
DDAG
3900 } else {
3901 /* not the 1st TP within the HP */
3902 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 3903 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
3904 host, last_host);
3905 ret = -EINVAL;
3906 break;
3907 }
a7180877
DDAG
3908 }
3909
c53b7ddc 3910
a7180877
DDAG
3911 /*
3912 * If it's the last part of a host page then we place the host
3913 * page
3914 */
3915 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 3916 (block->page_size - 1)) == 0;
a7180877
DDAG
3917 place_source = postcopy_host_page;
3918 }
c53b7ddc 3919 last_host = host;
a7180877
DDAG
3920
3921 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3922 case RAM_SAVE_FLAG_ZERO:
3923 ch = qemu_get_byte(f);
3924 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3925 if (ch) {
3926 all_zero = false;
3927 }
3928 break;
3929
3930 case RAM_SAVE_FLAG_PAGE:
3931 all_zero = false;
3932 if (!matches_target_page_size) {
3933 /* For huge pages, we always use temporary buffer */
3934 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3935 } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy. Instead we directly use
                 * the buffer of QEMUFile to place the page. Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer, to make sure the buffer is valid when
                 * placing the page.
                 */
3944 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3945 TARGET_PAGE_SIZE);
3946 }
3947 break;
3948 case RAM_SAVE_FLAG_EOS:
3949 /* normal exit */
6df264ac 3950 multifd_recv_sync_main();
a7180877
DDAG
3951 break;
3952 default:
3953 error_report("Unknown combination of migration flags: %#x"
3954 " (postcopy mode)", flags);
3955 ret = -EINVAL;
7a9ddfbf
PX
3956 break;
3957 }
3958
3959 /* Detect for any possible file errors */
3960 if (!ret && qemu_file_get_error(f)) {
3961 ret = qemu_file_get_error(f);
a7180877
DDAG
3962 }
3963
7a9ddfbf 3964 if (!ret && place_needed) {
a7180877 3965 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
3966 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3967
a7180877 3968 if (all_zero) {
df9ff5e1 3969 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3970 block);
a7180877 3971 } else {
df9ff5e1 3972 ret = postcopy_place_page(mis, place_dest,
8be4620b 3973 place_source, block);
a7180877
DDAG
3974 }
3975 }
a7180877
DDAG
3976 }
3977
3978 return ret;
3979}
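/*
 * Illustrative sketch, not part of the original file: the host-page
 * assembly above reduces to offset arithmetic on block->page_size. The
 * hypothetical helper below reproduces those decisions for an assumed
 * 2 MiB hugetlbfs host page and 4 KiB target pages; with these sizes 512
 * target pages form one host page and only the last one sets place_needed.
 */
#if 0   /* example only, never compiled */
#define EX_TARGET_PAGE_SIZE 4096u               /* assumed */
#define EX_HOST_PAGE_SIZE   (2u * 1024 * 1024)  /* assumed hugetlbfs page */

static void example_host_page_assembly(uintptr_t host, uint8_t *tmp_host_page,
                                       uint8_t **page_buffer,
                                       bool *first_tp, bool *place_needed)
{
    uintptr_t in_hp_offset = host & (EX_HOST_PAGE_SIZE - 1);

    /* Where this target page lands inside the temporary host page. */
    *page_buffer = tmp_host_page + in_hp_offset;
    /* First target page of the host page: a zero fill can be optimised. */
    *first_tp = in_hp_offset == 0;
    /* Last target page of the host page: place the whole page atomically. */
    *place_needed =
        ((host + EX_TARGET_PAGE_SIZE) & (EX_HOST_PAGE_SIZE - 1)) == 0;
}
#endif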
3980
acab30b8
DHB
3981static bool postcopy_is_advised(void)
3982{
3983 PostcopyState ps = postcopy_state_get();
3984 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3985}
3986
3987static bool postcopy_is_running(void)
3988{
3989 PostcopyState ps = postcopy_state_get();
3990 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3991}
3992
/*
 * Flush the contents of the RAM cache into the SVM's memory.
 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
 */
3997static void colo_flush_ram_cache(void)
3998{
3999 RAMBlock *block = NULL;
4000 void *dst_host;
4001 void *src_host;
4002 unsigned long offset = 0;
4003
d1955d22
HZ
4004 memory_global_dirty_log_sync();
4005 rcu_read_lock();
4006 RAMBLOCK_FOREACH_MIGRATABLE(block) {
4007 migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4008 }
4009 rcu_read_unlock();
4010
e6f4aa18
ZC
4011 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4012 rcu_read_lock();
4013 block = QLIST_FIRST_RCU(&ram_list.blocks);
4014
4015 while (block) {
4016 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4017
4018 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4019 offset = 0;
4020 block = QLIST_NEXT_RCU(block, next);
4021 } else {
4022 migration_bitmap_clear_dirty(ram_state, block, offset);
4023 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4024 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4025 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4026 }
4027 }
4028
4029 rcu_read_unlock();
4030 trace_colo_flush_ram_cache_end();
4031}
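/*
 * Note, derived from the code above: the flush walks every migratable block
 * once, using migration_bitmap_find_dirty() as a cursor; when the cursor
 * runs past used_length it resets to 0 and moves to the next block, so each
 * dirty target page is cleared and copied from the COLO cache exactly once
 * per flush.
 */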
4032
56e93d26
JQ
4033static int ram_load(QEMUFile *f, void *opaque, int version_id)
4034{
edc60127 4035 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
4036 static uint64_t seq_iter;
4037 int len = 0;
a7180877
DDAG
4038 /*
4039 * If system is running in postcopy mode, page inserts to host memory must
4040 * be atomic
4041 */
acab30b8 4042 bool postcopy_running = postcopy_is_running();
ef08fb38 4043 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4044 bool postcopy_advised = postcopy_is_advised();
56e93d26
JQ
4045
4046 seq_iter++;
4047
4048 if (version_id != 4) {
4049 ret = -EINVAL;
4050 }
4051
edc60127
JQ
4052 if (!migrate_use_compression()) {
4053 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4054 }
    /*
     * This RCU critical section can be very long running.
     * If RCU reclamation in this code path becomes frequent, it will be
     * necessary to reduce the granularity of this critical section.
     */
4060 rcu_read_lock();
a7180877
DDAG
4061
4062 if (postcopy_running) {
4063 ret = ram_load_postcopy(f);
4064 }
4065
4066 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4067 ram_addr_t addr, total_ram_bytes;
a776aa15 4068 void *host = NULL;
56e93d26
JQ
4069 uint8_t ch;
4070
4071 addr = qemu_get_be64(f);
4072 flags = addr & ~TARGET_PAGE_MASK;
4073 addr &= TARGET_PAGE_MASK;
4074
edc60127
JQ
4075 if (flags & invalid_flags) {
4076 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4077 error_report("Received an unexpected compressed page");
4078 }
4079
4080 ret = -EINVAL;
4081 break;
4082 }
4083
bb890ed5 4084 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4085 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4086 RAMBlock *block = ram_block_from_stream(f, flags);
4087
13af18f2
ZC
4088 /*
4089 * After going into COLO, we should load the Page into colo_cache.
4090 */
4091 if (migration_incoming_in_colo_state()) {
4092 host = colo_cache_from_block_offset(block, addr);
4093 } else {
4094 host = host_from_ram_block_offset(block, addr);
4095 }
a776aa15
DDAG
4096 if (!host) {
4097 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4098 ret = -EINVAL;
4099 break;
4100 }
13af18f2
ZC
4101
4102 if (!migration_incoming_in_colo_state()) {
4103 ramblock_recv_bitmap_set(block, host);
4104 }
4105
1db9d8e5 4106 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4107 }
4108
56e93d26
JQ
4109 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4110 case RAM_SAVE_FLAG_MEM_SIZE:
4111 /* Synchronize RAM block list */
4112 total_ram_bytes = addr;
4113 while (!ret && total_ram_bytes) {
4114 RAMBlock *block;
56e93d26
JQ
4115 char id[256];
4116 ram_addr_t length;
4117
4118 len = qemu_get_byte(f);
4119 qemu_get_buffer(f, (uint8_t *)id, len);
4120 id[len] = 0;
4121 length = qemu_get_be64(f);
4122
e3dd7493 4123 block = qemu_ram_block_by_name(id);
b895de50
CLG
4124 if (block && !qemu_ram_is_migratable(block)) {
4125 error_report("block %s should not be migrated !", id);
4126 ret = -EINVAL;
4127 } else if (block) {
4128 if (length != block->used_length) {
4129 Error *local_err = NULL;
56e93d26 4130
fa53a0e5 4131 ret = qemu_ram_resize(block, length,
4132 &local_err);
4133 if (local_err) {
4134 error_report_err(local_err);
56e93d26 4135 }
56e93d26 4136 }
ef08fb38
DDAG
4137 /* For postcopy we need to check hugepage sizes match */
4138 if (postcopy_advised &&
4139 block->page_size != qemu_host_page_size) {
4140 uint64_t remote_page_size = qemu_get_be64(f);
4141 if (remote_page_size != block->page_size) {
4142 error_report("Mismatched RAM page size %s "
4143 "(local) %zd != %" PRId64,
4144 id, block->page_size,
4145 remote_page_size);
4146 ret = -EINVAL;
4147 }
4148 }
e3dd7493
DDAG
4149 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4150 block->idstr);
4151 } else {
4152 error_report("Unknown ramblock \"%s\", cannot "
4153 "accept migration", id);
4154 ret = -EINVAL;
4155 }
4156
4157 total_ram_bytes -= length;
4158 }
4159 break;
a776aa15 4160
bb890ed5 4161 case RAM_SAVE_FLAG_ZERO:
4162 ch = qemu_get_byte(f);
4163 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4164 break;
a776aa15 4165
56e93d26 4166 case RAM_SAVE_FLAG_PAGE:
4167 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4168 break;
56e93d26 4169
a776aa15 4170 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4171 len = qemu_get_be32(f);
4172 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4173 error_report("Invalid compressed data length: %d", len);
4174 ret = -EINVAL;
4175 break;
4176 }
c1bc6626 4177 decompress_data_with_multi_threads(f, host, len);
56e93d26 4178 break;
a776aa15 4179
56e93d26 4180 case RAM_SAVE_FLAG_XBZRLE:
4181 if (load_xbzrle(f, addr, host) < 0) {
4182 error_report("Failed to decompress XBZRLE page at "
4183 RAM_ADDR_FMT, addr);
4184 ret = -EINVAL;
4185 break;
4186 }
4187 break;
4188 case RAM_SAVE_FLAG_EOS:
4189 /* normal exit */
6df264ac 4190 multifd_recv_sync_main();
4191 break;
4192 default:
4193 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4194 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4195 } else {
4196 error_report("Unknown combination of migration flags: %#x",
4197 flags);
4198 ret = -EINVAL;
4199 }
4200 }
4201 if (!ret) {
4202 ret = qemu_file_get_error(f);
4203 }
4204 }
4205
34ab9e97 4206 ret |= wait_for_decompress_done();
56e93d26 4207 rcu_read_unlock();
55c4446b 4208 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4209
4210 if (!ret && migration_incoming_in_colo_state()) {
4211 colo_flush_ram_cache();
4212 }
56e93d26
JQ
4213 return ret;
4214}
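/*
 * Summary of the stream layout consumed by ram_load(), reconstructed from
 * the code above. Each record starts with a be64 word whose low bits carry
 * the RAM_SAVE_FLAG_* bits and whose page-aligned part is the offset into
 * the current RAMBlock:
 *
 *   RAM_SAVE_FLAG_MEM_SIZE       total RAM size, then per block: id length,
 *                                id bytes, be64 block length (plus a be64
 *                                page size for huge-page blocks when
 *                                postcopy was advised)
 *   RAM_SAVE_FLAG_ZERO           one fill byte
 *   RAM_SAVE_FLAG_PAGE           TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE  be32 length, then zlib-compressed data
 *   RAM_SAVE_FLAG_XBZRLE         flag byte, be16 length, encoded data
 *   RAM_SAVE_FLAG_EOS            end of section
 *
 * Unless RAM_SAVE_FLAG_CONTINUE is set, ZERO/PAGE/COMPRESS/XBZRLE records
 * are preceded by a block id (one byte length plus id bytes).
 */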
4215
c6467627
VSO
4216static bool ram_has_postcopy(void *opaque)
4217{
469dd51b
JH
4218 RAMBlock *rb;
4219 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
4220 if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
4223 return false;
4224 }
4225 }
4226
c6467627
VSO
4227 return migrate_postcopy_ram();
4228}
4229
edd090c7
PX
/* Sync all the dirty bitmaps with the destination VM. */
4231static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4232{
4233 RAMBlock *block;
4234 QEMUFile *file = s->to_dst_file;
4235 int ramblock_count = 0;
4236
4237 trace_ram_dirty_bitmap_sync_start();
4238
ff0769a4 4239 RAMBLOCK_FOREACH_MIGRATABLE(block) {
4240 qemu_savevm_send_recv_bitmap(file, block->idstr);
4241 trace_ram_dirty_bitmap_request(block->idstr);
4242 ramblock_count++;
4243 }
4244
4245 trace_ram_dirty_bitmap_sync_wait();
4246
    /* Wait until all the ramblocks' dirty bitmaps are synced */
4248 while (ramblock_count--) {
4249 qemu_sem_wait(&s->rp_state.rp_sem);
4250 }
4251
4252 trace_ram_dirty_bitmap_sync_complete();
4253
4254 return 0;
4255}
4256
4257static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4258{
4259 qemu_sem_post(&s->rp_state.rp_sem);
4260}
4261
/*
 * Read the received bitmap and invert it to form the initial dirty bitmap.
 * This is only used when a postcopy migration is paused and is being
 * resumed from a middle point.
 */
4267int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4268{
4269 int ret = -EINVAL;
4270 QEMUFile *file = s->rp_state.from_dst_file;
4271 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4272 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4273 uint64_t size, end_mark;
4274
4275 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4276
4277 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4278 error_report("%s: incorrect state %s", __func__,
4279 MigrationStatus_str(s->state));
4280 return -EINVAL;
4281 }
4282
    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
4287 local_size = ROUND_UP(local_size, 8);
4288
4289 /* Add paddings */
4290 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4291
4292 size = qemu_get_be64(file);
4293
4294 /* The size of the bitmap should match with our ramblock */
4295 if (size != local_size) {
4296 error_report("%s: ramblock '%s' bitmap size mismatch "
4297 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4298 block->idstr, size, local_size);
4299 ret = -EINVAL;
4300 goto out;
4301 }
4302
4303 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4304 end_mark = qemu_get_be64(file);
4305
4306 ret = qemu_file_get_error(file);
4307 if (ret || size != local_size) {
4308 error_report("%s: read bitmap failed for ramblock '%s': %d"
4309 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4310 __func__, block->idstr, ret, local_size, size);
4311 ret = -EIO;
4312 goto out;
4313 }
4314
4315 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4316 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4317 __func__, block->idstr, end_mark);
4318 ret = -EINVAL;
4319 goto out;
4320 }
4321
    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change, so we can modify it directly.
     */
4326 bitmap_from_le(block->bmap, le_bitmap, nbits);
4327
    /*
     * What we received is the "received bitmap". Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
4332 bitmap_complement(block->bmap, block->bmap, nbits);
4333
4334 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4335
    /*
     * We have successfully synced the bitmap for the current ramblock. If
     * this is the last one to sync, we need to notify the main send thread.
     */
4340 ram_dirty_bitmap_reload_notify(s);
4341
a335debb
PX
4342 ret = 0;
4343out:
bf269906 4344 g_free(le_bitmap);
a335debb
PX
4345 return ret;
4346}
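/*
 * Summary of the received-bitmap wire format parsed above, reconstructed
 * from the code: a be64 size in bytes (the bitmap rounded up to a multiple
 * of eight bytes), that many bytes of little-endian bitmap data, then a
 * be64 end marker that must equal RAMBLOCK_RECV_BITMAP_ENDING. The bitmap
 * is converted to host endianness and complemented, so pages already
 * received become clean and everything else becomes dirty again.
 */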
4347
edd090c7
PX
4348static int ram_resume_prepare(MigrationState *s, void *opaque)
4349{
4350 RAMState *rs = *(RAMState **)opaque;
08614f34 4351 int ret;
edd090c7 4352
08614f34
PX
4353 ret = ram_dirty_bitmap_sync_all(s, rs);
4354 if (ret) {
4355 return ret;
4356 }
4357
4358 ram_state_resume_prepare(rs, s->to_dst_file);
4359
4360 return 0;
edd090c7
PX
4361}
4362
56e93d26 4363static SaveVMHandlers savevm_ram_handlers = {
9907e842 4364 .save_setup = ram_save_setup,
56e93d26 4365 .save_live_iterate = ram_save_iterate,
763c906b 4366 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4367 .save_live_complete_precopy = ram_save_complete,
c6467627 4368 .has_postcopy = ram_has_postcopy,
4369 .save_live_pending = ram_save_pending,
4370 .load_state = ram_load,
4371 .save_cleanup = ram_save_cleanup,
4372 .load_setup = ram_load_setup,
4373 .load_cleanup = ram_load_cleanup,
edd090c7 4374 .resume_prepare = ram_resume_prepare,
4375};
4376
4377void ram_mig_init(void)
4378{
4379 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 4380 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4381}