56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879 29#include "cpu.h"
56e93d26 30#include <zlib.h>
4addcd4f 31#include "qapi-event.h"
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
56e93d26 43#include "migration/page_cache.h"
56e93d26 44#include "qemu/error-report.h"
56e93d26 45#include "trace.h"
56e93d26 46#include "exec/ram_addr.h"
56e93d26 47#include "qemu/rcu_queue.h"
a91246c9 48#include "migration/colo.h"
56e93d26 49
56e93d26
JQ
50/***********************************************************/
51/* ram save/restore */
52
bb890ed5
JQ
53/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS. It
54 * worked for pages that were filled with the same char. We switched
55 * it to only search for the zero value, and renamed it to avoid
56 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
57 */
58
56e93d26 59#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 60#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
61#define RAM_SAVE_FLAG_MEM_SIZE 0x04
62#define RAM_SAVE_FLAG_PAGE 0x08
63#define RAM_SAVE_FLAG_EOS 0x10
64#define RAM_SAVE_FLAG_CONTINUE 0x20
65#define RAM_SAVE_FLAG_XBZRLE 0x40
66/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
67#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
68
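The flags above are OR'd into the low bits of each page's offset when a header is written; offsets are target-page aligned, so those bits are otherwise free. A minimal standalone sketch of packing and unpacking that combined field, assuming a 4 KiB target page (names and sizes are illustrative, not QEMU API):

#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL                 /* assumed target page size */
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

int main(void)
{
    uint64_t offset = 42 * SKETCH_PAGE_SIZE;     /* page-aligned offset */
    uint64_t wire   = offset | 0x08              /* RAM_SAVE_FLAG_PAGE */
                             | 0x20;             /* RAM_SAVE_FLAG_CONTINUE */

    uint64_t flags = wire & ~SKETCH_PAGE_MASK;   /* low bits carry the flags */
    uint64_t addr  = wire & SKETCH_PAGE_MASK;    /* high bits carry the page */

    printf("addr=0x%llx flags=0x%llx\n",
           (unsigned long long)addr, (unsigned long long)flags);
    return 0;
}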
56e93d26
JQ
69static inline bool is_zero_range(uint8_t *p, uint64_t size)
70{
a1febc49 71 return buffer_is_zero(p, size);
56e93d26
JQ
72}
73
9360447d
JQ
74XBZRLECacheStats xbzrle_counters;
75
56e93d26
JQ
76/* This struct contains the XBZRLE cache and a static page
77 used by the compression */
78static struct {
79 /* buffer used for XBZRLE encoding */
80 uint8_t *encoded_buf;
81 /* buffer for storing page content */
82 uint8_t *current_buf;
83 /* Cache for XBZRLE, Protected by lock. */
84 PageCache *cache;
85 QemuMutex lock;
c00e0928
JQ
86 /* it will store a page full of zeros */
87 uint8_t *zero_target_page;
56e93d26
JQ
88} XBZRLE;
89
90/* buffer used for XBZRLE decoding */
91static uint8_t *xbzrle_decoded_buf;
92
93static void XBZRLE_cache_lock(void)
94{
95 if (migrate_use_xbzrle())
96 qemu_mutex_lock(&XBZRLE.lock);
97}
98
99static void XBZRLE_cache_unlock(void)
100{
101 if (migrate_use_xbzrle())
102 qemu_mutex_unlock(&XBZRLE.lock);
103}
104
3d0684b2
JQ
105/**
106 * xbzrle_cache_resize: resize the xbzrle cache
107 *
108 * This function is called from qmp_migrate_set_cache_size in main
109 * thread, possibly while a migration is in progress. A running
110 * migration may be using the cache and might finish during this call,
111 * hence changes to the cache are protected by the XBZRLE.lock mutex.
112 *
113 * Returns the new_size or negative in case of error.
114 *
115 * @new_size: new cache size
56e93d26
JQ
116 */
117int64_t xbzrle_cache_resize(int64_t new_size)
118{
119 PageCache *new_cache;
120 int64_t ret;
121
122 if (new_size < TARGET_PAGE_SIZE) {
123 return -1;
124 }
125
126 XBZRLE_cache_lock();
127
128 if (XBZRLE.cache != NULL) {
129 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
130 goto out_new_size;
131 }
132 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
133 TARGET_PAGE_SIZE);
134 if (!new_cache) {
135 error_report("Error creating cache");
136 ret = -1;
137 goto out;
138 }
139
140 cache_fini(XBZRLE.cache);
141 XBZRLE.cache = new_cache;
142 }
143
144out_new_size:
145 ret = pow2floor(new_size);
146out:
147 XBZRLE_cache_unlock();
148 return ret;
149}
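Note that the value returned on success is pow2floor(new_size), i.e. the requested byte count rounded down to a power of two, which is also what migrate_xbzrle_cache_size() is compared against above. A small standalone sketch of that rounding (the helper below is my own, not QEMU's pow2floor()):

#include <stdint.h>
#include <stdio.h>

/* Round down to the largest power of two <= value (value must be > 0). */
static uint64_t pow2floor_sketch(uint64_t value)
{
    uint64_t result = 1;

    while (value >>= 1) {
        result <<= 1;
    }
    return result;
}

int main(void)
{
    /* e.g. asking for 300 MiB of XBZRLE cache ends up as 256 MiB */
    uint64_t requested = 300ULL << 20;

    printf("%llu -> %llu\n", (unsigned long long)requested,
           (unsigned long long)pow2floor_sketch(requested));
    return 0;
}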
150
ec481c6c
JQ
151/*
152 * An outstanding page request, on the source, having been received
153 * and queued
154 */
155struct RAMSrcPageRequest {
156 RAMBlock *rb;
157 hwaddr offset;
158 hwaddr len;
159
160 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
161};
162
6f37bb8b
JQ
163/* State of RAM for migration */
164struct RAMState {
204b88b8
JQ
165 /* QEMUFile used for this migration */
166 QEMUFile *f;
6f37bb8b
JQ
167 /* Last block that we have visited searching for dirty pages */
168 RAMBlock *last_seen_block;
169 /* Last block from where we have sent data */
170 RAMBlock *last_sent_block;
269ace29
JQ
171 /* Last dirty target page we have sent */
172 ram_addr_t last_page;
6f37bb8b
JQ
173 /* last ram version we have seen */
174 uint32_t last_version;
175 /* We are in the first round */
176 bool ram_bulk_stage;
8d820d6f
JQ
177 /* How many times we have dirty too many pages */
178 int dirty_rate_high_cnt;
f664da80
JQ
179 /* these variables are used for bitmap sync */
180 /* last time we did a full bitmap_sync */
181 int64_t time_last_bitmap_sync;
eac74159 182 /* bytes transferred at start_time */
c4bdf0cf 183 uint64_t bytes_xfer_prev;
a66cd90c 184 /* number of dirty pages since start_time */
68908ed6 185 uint64_t num_dirty_pages_period;
b5833fde
JQ
186 /* xbzrle misses since the beginning of the period */
187 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
188 /* number of iterations at the beginning of period */
189 uint64_t iterations_prev;
23b28c3c
JQ
190 /* Iterations since start */
191 uint64_t iterations;
108cfae0 192 /* protects modification of the bitmap */
9360447d
JQ
193 uint64_t migration_dirty_pages;
194 /* number of dirty bits in the bitmap */
108cfae0 195 QemuMutex bitmap_mutex;
68a098f3
JQ
196 /* The RAMBlock used in the last src_page_requests */
197 RAMBlock *last_req_rb;
ec481c6c
JQ
198 /* Queue of outstanding page requests from the destination */
199 QemuMutex src_page_req_mutex;
200 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
201};
202typedef struct RAMState RAMState;
203
53518d94 204static RAMState *ram_state;
6f37bb8b 205
9edabd4d 206uint64_t ram_bytes_remaining(void)
2f4fde93 207{
53518d94 208 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
209}
210
9360447d 211MigrationStats ram_counters;
96506894 212
b8fb8cb7
DDAG
213/* used by the search for pages to send */
214struct PageSearchStatus {
215 /* Current block being searched */
216 RAMBlock *block;
a935e30f
JQ
217 /* Current page to search from */
218 unsigned long page;
b8fb8cb7
DDAG
219 /* Set once we wrap around */
220 bool complete_round;
221};
222typedef struct PageSearchStatus PageSearchStatus;
223
56e93d26 224struct CompressParam {
56e93d26 225 bool done;
90e56fb4 226 bool quit;
56e93d26
JQ
227 QEMUFile *file;
228 QemuMutex mutex;
229 QemuCond cond;
230 RAMBlock *block;
231 ram_addr_t offset;
232};
233typedef struct CompressParam CompressParam;
234
235struct DecompressParam {
73a8912b 236 bool done;
90e56fb4 237 bool quit;
56e93d26
JQ
238 QemuMutex mutex;
239 QemuCond cond;
240 void *des;
d341d9f3 241 uint8_t *compbuf;
56e93d26
JQ
242 int len;
243};
244typedef struct DecompressParam DecompressParam;
245
246static CompressParam *comp_param;
247static QemuThread *compress_threads;
248/* comp_done_cond is used to wake up the migration thread when
249 * one of the compression threads has finished the compression.
250 * comp_done_lock is used to co-work with comp_done_cond.
251 */
0d9f9a5c
LL
252static QemuMutex comp_done_lock;
253static QemuCond comp_done_cond;
56e93d26
JQ
254/* The empty QEMUFileOps will be used by file in CompressParam */
255static const QEMUFileOps empty_ops = { };
256
56e93d26
JQ
257static DecompressParam *decomp_param;
258static QemuThread *decompress_threads;
73a8912b
LL
259static QemuMutex decomp_done_lock;
260static QemuCond decomp_done_cond;
56e93d26 261
a7a9a88f
LL
262static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
263 ram_addr_t offset);
56e93d26
JQ
264
265static void *do_data_compress(void *opaque)
266{
267 CompressParam *param = opaque;
a7a9a88f
LL
268 RAMBlock *block;
269 ram_addr_t offset;
56e93d26 270
a7a9a88f 271 qemu_mutex_lock(&param->mutex);
90e56fb4 272 while (!param->quit) {
a7a9a88f
LL
273 if (param->block) {
274 block = param->block;
275 offset = param->offset;
276 param->block = NULL;
277 qemu_mutex_unlock(&param->mutex);
278
279 do_compress_ram_page(param->file, block, offset);
280
0d9f9a5c 281 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 282 param->done = true;
0d9f9a5c
LL
283 qemu_cond_signal(&comp_done_cond);
284 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
285
286 qemu_mutex_lock(&param->mutex);
287 } else {
56e93d26
JQ
288 qemu_cond_wait(&param->cond, &param->mutex);
289 }
56e93d26 290 }
a7a9a88f 291 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
292
293 return NULL;
294}
295
296static inline void terminate_compression_threads(void)
297{
298 int idx, thread_count;
299
300 thread_count = migrate_compress_threads();
3d0684b2 301
56e93d26
JQ
302 for (idx = 0; idx < thread_count; idx++) {
303 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 304 comp_param[idx].quit = true;
56e93d26
JQ
305 qemu_cond_signal(&comp_param[idx].cond);
306 qemu_mutex_unlock(&comp_param[idx].mutex);
307 }
308}
309
310void migrate_compress_threads_join(void)
311{
312 int i, thread_count;
313
314 if (!migrate_use_compression()) {
315 return;
316 }
317 terminate_compression_threads();
318 thread_count = migrate_compress_threads();
319 for (i = 0; i < thread_count; i++) {
320 qemu_thread_join(compress_threads + i);
321 qemu_fclose(comp_param[i].file);
322 qemu_mutex_destroy(&comp_param[i].mutex);
323 qemu_cond_destroy(&comp_param[i].cond);
324 }
0d9f9a5c
LL
325 qemu_mutex_destroy(&comp_done_lock);
326 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
327 g_free(compress_threads);
328 g_free(comp_param);
56e93d26
JQ
329 compress_threads = NULL;
330 comp_param = NULL;
56e93d26
JQ
331}
332
333void migrate_compress_threads_create(void)
334{
335 int i, thread_count;
336
337 if (!migrate_use_compression()) {
338 return;
339 }
56e93d26
JQ
340 thread_count = migrate_compress_threads();
341 compress_threads = g_new0(QemuThread, thread_count);
342 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
343 qemu_cond_init(&comp_done_cond);
344 qemu_mutex_init(&comp_done_lock);
56e93d26 345 for (i = 0; i < thread_count; i++) {
e110aa91
C
346 /* comp_param[i].file is just used as a dummy buffer to save data,
347 * set its ops to empty.
56e93d26
JQ
348 */
349 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
350 comp_param[i].done = true;
90e56fb4 351 comp_param[i].quit = false;
56e93d26
JQ
352 qemu_mutex_init(&comp_param[i].mutex);
353 qemu_cond_init(&comp_param[i].cond);
354 qemu_thread_create(compress_threads + i, "compress",
355 do_data_compress, comp_param + i,
356 QEMU_THREAD_JOINABLE);
357 }
358}
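The compression threads created above use a simple handshake: each worker waits on its own mutex/cond for a (block, offset) pair, compresses it, then flags completion on the shared comp_done_cond so the migration thread can collect the output and hand out more work. A self-contained pthreads sketch of the same pattern (all names are hypothetical; the real code uses QEMU's qemu_thread/qemu_mutex wrappers and writes into a per-thread QEMUFile):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    bool done;            /* worker is idle, result collected */
    bool quit;            /* ask the worker to exit */
    int  work;            /* stands in for (block, offset); 0 == no work */
} Worker;

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;

static void *worker_fn(void *opaque)
{
    Worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->work) {
            int item = w->work;
            w->work = 0;
            pthread_mutex_unlock(&w->mutex);

            printf("compressing item %d\n", item);    /* the real work */

            pthread_mutex_lock(&done_lock);
            w->done = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);

            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}

int main(void)
{
    Worker w = { .done = true };
    pthread_t tid;

    pthread_mutex_init(&w.mutex, NULL);
    pthread_cond_init(&w.cond, NULL);
    pthread_create(&tid, NULL, worker_fn, &w);

    /* mark the worker busy, then hand it one work item */
    pthread_mutex_lock(&done_lock);
    w.done = false;
    pthread_mutex_unlock(&done_lock);

    pthread_mutex_lock(&w.mutex);
    w.work = 42;
    pthread_cond_signal(&w.cond);
    pthread_mutex_unlock(&w.mutex);

    /* wait until the worker reports completion */
    pthread_mutex_lock(&done_lock);
    while (!w.done) {
        pthread_cond_wait(&done_cond, &done_lock);
    }
    pthread_mutex_unlock(&done_lock);

    /* ask the worker to quit and join it */
    pthread_mutex_lock(&w.mutex);
    w.quit = true;
    pthread_cond_signal(&w.cond);
    pthread_mutex_unlock(&w.mutex);
    pthread_join(tid, NULL);
    return 0;
}

The shared done_lock/done_cond pair stands in for comp_done_lock/comp_done_cond; the per-worker mutex protects only the work handoff, which is why the worker drops it while compressing.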
359
360/**
3d0684b2 361 * save_page_header: write page header to wire
56e93d26
JQ
362 *
363 * If this is the 1st block, it also writes the block identification
364 *
3d0684b2 365 * Returns the number of bytes written
56e93d26
JQ
366 *
367 * @f: QEMUFile where to send the data
368 * @block: block that contains the page we want to send
369 * @offset: offset inside the block for the page
370 * in the lower bits, it contains flags
371 */
2bf3aa85
JQ
372static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
373 ram_addr_t offset)
56e93d26 374{
9f5f380b 375 size_t size, len;
56e93d26 376
24795694
JQ
377 if (block == rs->last_sent_block) {
378 offset |= RAM_SAVE_FLAG_CONTINUE;
379 }
2bf3aa85 380 qemu_put_be64(f, offset);
56e93d26
JQ
381 size = 8;
382
383 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 384 len = strlen(block->idstr);
2bf3aa85
JQ
385 qemu_put_byte(f, len);
386 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 387 size += 1 + len;
24795694 388 rs->last_sent_block = block;
56e93d26
JQ
389 }
390 return size;
391}
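On the wire this produces a big-endian 64-bit (offset | flags) word, followed — only when RAM_SAVE_FLAG_CONTINUE is clear — by a one-byte length and the block's idstr. A minimal sketch of a reader for that layout, parsing from a memory buffer instead of a QEMUFile (helper and constant names are mine, and a 4 KiB target page is assumed):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_PAGE_MASK (~0xfffULL)   /* assumes 4 KiB target pages */
#define FLAG_CONTINUE    0x20          /* RAM_SAVE_FLAG_CONTINUE */

/* Parse one page header from buf; returns the number of bytes consumed. */
static size_t parse_page_header(const uint8_t *buf, uint64_t *addr,
                                uint64_t *flags, char *idstr, size_t idstr_sz)
{
    uint64_t word = 0;
    size_t pos = 0;

    for (int i = 0; i < 8; i++) {      /* big-endian 64-bit word */
        word = (word << 8) | buf[pos++];
    }
    *flags = word & ~SKETCH_PAGE_MASK;
    *addr  = word & SKETCH_PAGE_MASK;

    idstr[0] = '\0';
    if (!(*flags & FLAG_CONTINUE)) {   /* first page of a new block */
        uint8_t len = buf[pos++];
        if (len < idstr_sz) {
            memcpy(idstr, buf + pos, len);
            idstr[len] = '\0';
        }
        pos += len;
    }
    return pos;
}

int main(void)
{
    /* header for page 3 of block "pc.ram" with RAM_SAVE_FLAG_PAGE (0x08) */
    uint8_t buf[32] = { 0, 0, 0, 0, 0, 0, 0x30, 0x08,
                        6, 'p', 'c', '.', 'r', 'a', 'm' };
    uint64_t addr, flags;
    char idstr[32];

    size_t used = parse_page_header(buf, &addr, &flags, idstr, sizeof(idstr));
    printf("block=%s addr=0x%llx flags=0x%llx header=%zu bytes\n",
           idstr, (unsigned long long)addr, (unsigned long long)flags, used);
    return 0;
}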
392
3d0684b2
JQ
393/**
394 * mig_throttle_guest_down: throttle down the guest
395 *
396 * Reduce amount of guest cpu execution to hopefully slow down memory
397 * writes. If guest dirty memory rate is reduced below the rate at
398 * which we can transfer pages to the destination then we should be
399 * able to complete migration. Some workloads dirty memory way too
400 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
401 */
402static void mig_throttle_guest_down(void)
403{
404 MigrationState *s = migrate_get_current();
2594f56d
DB
405 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
406 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
407
408 /* We have not started throttling yet. Let's start it. */
409 if (!cpu_throttle_active()) {
410 cpu_throttle_set(pct_initial);
411 } else {
412 /* Throttling already on, just increase the rate */
413 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
414 }
415}
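For instance, with a hypothetical cpu_throttle_initial of 20% and cpu_throttle_increment of 10%, successive too-dirty periods throttle the guest at 20%, 30%, 40% and so on (cpu_throttle_set() itself clamps the percentage, to 99% if I recall correctly). A tiny sketch of that progression:

#include <stdio.h>

int main(void)
{
    /* hypothetical parameter values; the real ones come from
     * s->parameters.cpu_throttle_initial / cpu_throttle_increment */
    unsigned initial = 20, increment = 10, max_pct = 99;
    unsigned pct = 0;

    for (int period = 1; period <= 10; period++) {
        if (pct == 0) {
            pct = initial;              /* first time: start throttling */
        } else {
            pct += increment;           /* already throttling: step up */
        }
        if (pct > max_pct) {
            pct = max_pct;
        }
        printf("period %2d: throttle guest %u%%\n", period, pct);
    }
    return 0;
}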
416
3d0684b2
JQ
417/**
418 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
419 *
6f37bb8b 420 * @rs: current RAM state
3d0684b2
JQ
421 * @current_addr: address for the zero page
422 *
423 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
424 * The important thing is that a stale (not-yet-0'd) page be replaced
425 * by the new data.
426 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 427 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 428 */
6f37bb8b 429static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 430{
6f37bb8b 431 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
432 return;
433 }
434
435 /* We don't care if this fails to allocate a new cache page
436 * as long as it updated an old one */
c00e0928 437 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 438 ram_counters.dirty_sync_count);
56e93d26
JQ
439}
440
441#define ENCODING_FLAG_XBZRLE 0x1
442
443/**
444 * save_xbzrle_page: compress and send current page
445 *
446 * Returns: 1 means that we wrote the page
447 * 0 means that page is identical to the one already sent
448 * -1 means that xbzrle would be longer than normal
449 *
5a987738 450 * @rs: current RAM state
3d0684b2
JQ
451 * @current_data: pointer to the address of the page contents
452 * @current_addr: addr of the page
56e93d26
JQ
453 * @block: block that contains the page we want to send
454 * @offset: offset inside the block for the page
455 * @last_stage: if we are at the completion stage
56e93d26 456 */
204b88b8 457static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 458 ram_addr_t current_addr, RAMBlock *block,
072c2511 459 ram_addr_t offset, bool last_stage)
56e93d26
JQ
460{
461 int encoded_len = 0, bytes_xbzrle;
462 uint8_t *prev_cached_page;
463
9360447d
JQ
464 if (!cache_is_cached(XBZRLE.cache, current_addr,
465 ram_counters.dirty_sync_count)) {
466 xbzrle_counters.cache_miss++;
56e93d26
JQ
467 if (!last_stage) {
468 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 469 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
470 return -1;
471 } else {
472 /* update *current_data when the page has been
473 inserted into cache */
474 *current_data = get_cached_data(XBZRLE.cache, current_addr);
475 }
476 }
477 return -1;
478 }
479
480 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
481
482 /* save current buffer into memory */
483 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
484
485 /* XBZRLE encoding (if there is no overflow) */
486 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
487 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
488 TARGET_PAGE_SIZE);
489 if (encoded_len == 0) {
55c4446b 490 trace_save_xbzrle_page_skipping();
56e93d26
JQ
491 return 0;
492 } else if (encoded_len == -1) {
55c4446b 493 trace_save_xbzrle_page_overflow();
9360447d 494 xbzrle_counters.overflow++;
56e93d26
JQ
495 /* update data in the cache */
496 if (!last_stage) {
497 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
498 *current_data = prev_cached_page;
499 }
500 return -1;
501 }
502
503 /* we need to update the data in the cache, in order to get the same data */
504 if (!last_stage) {
505 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
506 }
507
508 /* Send XBZRLE based compressed page */
2bf3aa85 509 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
510 offset | RAM_SAVE_FLAG_XBZRLE);
511 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
512 qemu_put_be16(rs->f, encoded_len);
513 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 514 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
515 xbzrle_counters.pages++;
516 xbzrle_counters.bytes += bytes_xbzrle;
517 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
518
519 return 1;
520}
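The bytes_xbzrle accounting above reflects the wire layout of an XBZRLE page: the usual page header, one ENCODING_FLAG_XBZRLE byte, a big-endian 16-bit encoded length, then the encoded delta — hence the "encoded_len + 1 + 2". A small worked sketch of the cost for a page whose delta encodes to 120 bytes (the 8-byte header assumes the RAM_SAVE_FLAG_CONTINUE case, i.e. no block name repeated):

#include <stdio.h>

int main(void)
{
    unsigned header_bytes = 8;      /* offset|flags word, CONTINUE case */
    unsigned encoded_len  = 120;    /* hypothetical xbzrle_encode_buffer() result */
    unsigned xbzrle_bytes = header_bytes + 1 /* encoding flag */
                                         + 2 /* be16 length   */
                                         + encoded_len;

    printf("XBZRLE record: %u bytes vs %u for a full 4 KiB page\n",
           xbzrle_bytes, 8 + 4096);
    return 0;
}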
521
3d0684b2
JQ
522/**
523 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 524 *
3d0684b2
JQ
525 * Called with rcu_read_lock() to protect migration_bitmap
526 *
527 * Returns the byte offset within memory region of the start of a dirty page
528 *
6f37bb8b 529 * @rs: current RAM state
3d0684b2 530 * @rb: RAMBlock where to search for dirty pages
a935e30f 531 * @start: page where we start the search
f3f491fc 532 */
56e93d26 533static inline
a935e30f 534unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 535 unsigned long start)
56e93d26 536{
6b6712ef
JQ
537 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
538 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
539 unsigned long next;
540
6b6712ef
JQ
541 if (rs->ram_bulk_stage && start > 0) {
542 next = start + 1;
56e93d26 543 } else {
6b6712ef 544 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
545 }
546
6b6712ef 547 return next;
56e93d26
JQ
548}
549
06b10688 550static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
551 RAMBlock *rb,
552 unsigned long page)
a82d593b
DDAG
553{
554 bool ret;
a82d593b 555
6b6712ef 556 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
557
558 if (ret) {
0d8ec885 559 rs->migration_dirty_pages--;
a82d593b
DDAG
560 }
561 return ret;
562}
563
15440dd5
JQ
564static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
565 ram_addr_t start, ram_addr_t length)
56e93d26 566{
0d8ec885 567 rs->migration_dirty_pages +=
6b6712ef 568 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 569 &rs->num_dirty_pages_period);
56e93d26
JQ
570}
571
3d0684b2
JQ
572/**
573 * ram_pagesize_summary: calculate all the pagesizes of a VM
574 *
575 * Returns a summary bitmap of the page sizes of all RAMBlocks
576 *
577 * For VMs with just normal pages this is equivalent to the host page
578 * size. If it's got some huge pages then it's the OR of all the
579 * different page sizes.
e8ca1db2
DDAG
580 */
581uint64_t ram_pagesize_summary(void)
582{
583 RAMBlock *block;
584 uint64_t summary = 0;
585
99e15582 586 RAMBLOCK_FOREACH(block) {
e8ca1db2
DDAG
587 summary |= block->page_size;
588 }
589
590 return summary;
591}
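For example, a guest with ordinary 4 KiB RAM plus one 2 MiB hugepage-backed RAMBlock would report 0x1000 | 0x200000 = 0x201000, so a single bit test tells whether any huge pages are present. A minimal sketch with made-up block page sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* hypothetical RAMBlock page sizes */
    uint64_t page_sizes[] = { 4096, 4096, 2 * 1024 * 1024 };
    uint64_t summary = 0;

    for (unsigned i = 0; i < sizeof(page_sizes) / sizeof(page_sizes[0]); i++) {
        summary |= page_sizes[i];   /* OR of all the page sizes seen */
    }
    printf("page size summary = 0x%llx\n", (unsigned long long)summary);
    return 0;
}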
592
8d820d6f 593static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
594{
595 RAMBlock *block;
56e93d26 596 int64_t end_time;
c4bdf0cf 597 uint64_t bytes_xfer_now;
56e93d26 598
9360447d 599 ram_counters.dirty_sync_count++;
56e93d26 600
f664da80
JQ
601 if (!rs->time_last_bitmap_sync) {
602 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
603 }
604
605 trace_migration_bitmap_sync_start();
9c1f8f44 606 memory_global_dirty_log_sync();
56e93d26 607
108cfae0 608 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 609 rcu_read_lock();
99e15582 610 RAMBLOCK_FOREACH(block) {
15440dd5 611 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
612 }
613 rcu_read_unlock();
108cfae0 614 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 615
a66cd90c 616 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 617
56e93d26
JQ
618 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
619
620 /* more than 1 second = 1000 milliseconds */
f664da80 621 if (end_time > rs->time_last_bitmap_sync + 1000) {
d693c6f1 622 /* calculate period counters */
9360447d 623 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
d693c6f1 624 / (end_time - rs->time_last_bitmap_sync);
9360447d 625 bytes_xfer_now = ram_counters.transferred;
d693c6f1 626
56e93d26
JQ
627 if (migrate_auto_converge()) {
628 /* The following detection logic can be refined later. For now:
629 Check to see if the dirtied bytes are 50% more than the approx.
630 amount of bytes that just got transferred since the last time we
070afca2
JH
631 were in this routine. If that happens twice, start or increase
632 throttling */
070afca2 633
d693c6f1 634 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 635 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 636 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 637 trace_migration_throttle();
8d820d6f 638 rs->dirty_rate_high_cnt = 0;
070afca2 639 mig_throttle_guest_down();
d693c6f1 640 }
56e93d26 641 }
070afca2 642
56e93d26 643 if (migrate_use_xbzrle()) {
23b28c3c 644 if (rs->iterations_prev != rs->iterations) {
9360447d
JQ
645 xbzrle_counters.cache_miss_rate =
646 (double)(xbzrle_counters.cache_miss -
b5833fde 647 rs->xbzrle_cache_miss_prev) /
23b28c3c 648 (rs->iterations - rs->iterations_prev);
56e93d26 649 }
23b28c3c 650 rs->iterations_prev = rs->iterations;
9360447d 651 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
56e93d26 652 }
d693c6f1
FF
653
654 /* reset period counters */
f664da80 655 rs->time_last_bitmap_sync = end_time;
a66cd90c 656 rs->num_dirty_pages_period = 0;
d2a4d85a 657 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 658 }
4addcd4f 659 if (migrate_use_events()) {
9360447d 660 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
4addcd4f 661 }
56e93d26
JQ
662}
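The auto-converge trigger above can be read as: the guest dirtied more than half of what was transferred in the same period, and that has now happened twice in a row. A small sketch of the predicate with invented numbers, assuming 4 KiB target pages:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_size   = 4096;            /* assumed TARGET_PAGE_SIZE */
    uint64_t dirty_pages = 200000;          /* dirtied during this period */
    uint64_t bytes_sent  = 1200ULL << 20;   /* transferred during this period */
    int high_cnt = 1;                       /* one too-dirty period already seen */

    bool too_dirty = dirty_pages * page_size > bytes_sent / 2;

    if (too_dirty && ++high_cnt >= 2) {
        high_cnt = 0;
        printf("would call mig_throttle_guest_down()\n");
    } else {
        printf("dirty rate acceptable this period (count=%d)\n", high_cnt);
    }
    return 0;
}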
663
664/**
3d0684b2 665 * save_zero_page: send the zero page to the stream
56e93d26 666 *
3d0684b2 667 * Returns the number of pages written.
56e93d26 668 *
f7ccd61b 669 * @rs: current RAM state
56e93d26
JQ
670 * @block: block that contains the page we want to send
671 * @offset: offset inside the block for the page
672 * @p: pointer to the page
56e93d26 673 */
ce25d337
JQ
674static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
675 uint8_t *p)
56e93d26
JQ
676{
677 int pages = -1;
678
679 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
9360447d
JQ
680 ram_counters.duplicate++;
681 ram_counters.transferred +=
bb890ed5 682 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
ce25d337 683 qemu_put_byte(rs->f, 0);
9360447d 684 ram_counters.transferred += 1;
56e93d26
JQ
685 pages = 1;
686 }
687
688 return pages;
689}
690
5727309d 691static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 692{
5727309d 693 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
694 return;
695 }
696
aaa2064c 697 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
698}
699
56e93d26 700/**
3d0684b2 701 * ram_save_page: send the given page to the stream
56e93d26 702 *
3d0684b2 703 * Returns the number of pages written.
3fd3c4b3
DDAG
704 * < 0 - error
705 * >=0 - Number of pages written - this might legally be 0
706 * if xbzrle noticed the page was the same.
56e93d26 707 *
6f37bb8b 708 * @rs: current RAM state
56e93d26
JQ
709 * @block: block that contains the page we want to send
710 * @offset: offset inside the block for the page
711 * @last_stage: if we are at the completion stage
56e93d26 712 */
a0a8aa14 713static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
714{
715 int pages = -1;
716 uint64_t bytes_xmit;
717 ram_addr_t current_addr;
56e93d26
JQ
718 uint8_t *p;
719 int ret;
720 bool send_async = true;
a08f6890 721 RAMBlock *block = pss->block;
a935e30f 722 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 723
2f68e399 724 p = block->host + offset;
1db9d8e5 725 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26
JQ
726
727 /* In doubt sent page as normal */
728 bytes_xmit = 0;
ce25d337 729 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
730 offset, TARGET_PAGE_SIZE, &bytes_xmit);
731 if (bytes_xmit) {
9360447d 732 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
733 pages = 1;
734 }
735
736 XBZRLE_cache_lock();
737
738 current_addr = block->offset + offset;
739
56e93d26
JQ
740 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
741 if (ret != RAM_SAVE_CONTROL_DELAYED) {
742 if (bytes_xmit > 0) {
9360447d 743 ram_counters.normal++;
56e93d26 744 } else if (bytes_xmit == 0) {
9360447d 745 ram_counters.duplicate++;
56e93d26
JQ
746 }
747 }
748 } else {
ce25d337 749 pages = save_zero_page(rs, block, offset, p);
56e93d26
JQ
750 if (pages > 0) {
751 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
752 * page would be stale
753 */
6f37bb8b 754 xbzrle_cache_zero_page(rs, current_addr);
a935e30f 755 ram_release_pages(block->idstr, offset, pages);
6f37bb8b 756 } else if (!rs->ram_bulk_stage &&
5727309d 757 !migration_in_postcopy() && migrate_use_xbzrle()) {
204b88b8 758 pages = save_xbzrle_page(rs, &p, current_addr, block,
072c2511 759 offset, last_stage);
56e93d26
JQ
760 if (!last_stage) {
761 /* Can't send this cached data async, since the cache page
762 * might get updated before it gets to the wire
763 */
764 send_async = false;
765 }
766 }
767 }
768
769 /* XBZRLE overflow or normal page */
770 if (pages == -1) {
9360447d
JQ
771 ram_counters.transferred +=
772 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
56e93d26 773 if (send_async) {
ce25d337 774 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
53f09a10 775 migrate_release_ram() &
5727309d 776 migration_in_postcopy());
56e93d26 777 } else {
ce25d337 778 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
56e93d26 779 }
9360447d 780 ram_counters.transferred += TARGET_PAGE_SIZE;
56e93d26 781 pages = 1;
9360447d 782 ram_counters.normal++;
56e93d26
JQ
783 }
784
785 XBZRLE_cache_unlock();
786
787 return pages;
788}
789
a7a9a88f
LL
790static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
791 ram_addr_t offset)
56e93d26 792{
53518d94 793 RAMState *rs = ram_state;
56e93d26 794 int bytes_sent, blen;
a7a9a88f 795 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 796
2bf3aa85 797 bytes_sent = save_page_header(rs, f, block, offset |
56e93d26 798 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 799 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 800 migrate_compress_level());
b3be2896
LL
801 if (blen < 0) {
802 bytes_sent = 0;
803 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
804 error_report("compressed data failed!");
805 } else {
806 bytes_sent += blen;
5727309d 807 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 808 }
56e93d26
JQ
809
810 return bytes_sent;
811}
812
ce25d337 813static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
814{
815 int idx, len, thread_count;
816
817 if (!migrate_use_compression()) {
818 return;
819 }
820 thread_count = migrate_compress_threads();
a7a9a88f 821
0d9f9a5c 822 qemu_mutex_lock(&comp_done_lock);
56e93d26 823 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 824 while (!comp_param[idx].done) {
0d9f9a5c 825 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 826 }
a7a9a88f 827 }
0d9f9a5c 828 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
829
830 for (idx = 0; idx < thread_count; idx++) {
831 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 832 if (!comp_param[idx].quit) {
ce25d337 833 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
9360447d 834 ram_counters.transferred += len;
56e93d26 835 }
a7a9a88f 836 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
837 }
838}
839
840static inline void set_compress_params(CompressParam *param, RAMBlock *block,
841 ram_addr_t offset)
842{
843 param->block = block;
844 param->offset = offset;
845}
846
ce25d337
JQ
847static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
848 ram_addr_t offset)
56e93d26
JQ
849{
850 int idx, thread_count, bytes_xmit = -1, pages = -1;
851
852 thread_count = migrate_compress_threads();
0d9f9a5c 853 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
854 while (true) {
855 for (idx = 0; idx < thread_count; idx++) {
856 if (comp_param[idx].done) {
a7a9a88f 857 comp_param[idx].done = false;
ce25d337 858 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 859 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 860 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
861 qemu_cond_signal(&comp_param[idx].cond);
862 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 863 pages = 1;
9360447d
JQ
864 ram_counters.normal++;
865 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
866 break;
867 }
868 }
869 if (pages > 0) {
870 break;
871 } else {
0d9f9a5c 872 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
873 }
874 }
0d9f9a5c 875 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
876
877 return pages;
878}
879
880/**
881 * ram_save_compressed_page: compress the given page and send it to the stream
882 *
3d0684b2 883 * Returns the number of pages written.
56e93d26 884 *
6f37bb8b 885 * @rs: current RAM state
56e93d26
JQ
886 * @block: block that contains the page we want to send
887 * @offset: offset inside the block for the page
888 * @last_stage: if we are at the completion stage
56e93d26 889 */
a0a8aa14
JQ
890static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
891 bool last_stage)
56e93d26
JQ
892{
893 int pages = -1;
fc50438e 894 uint64_t bytes_xmit = 0;
56e93d26 895 uint8_t *p;
fc50438e 896 int ret, blen;
a08f6890 897 RAMBlock *block = pss->block;
a935e30f 898 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 899
2f68e399 900 p = block->host + offset;
56e93d26 901
ce25d337 902 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
903 offset, TARGET_PAGE_SIZE, &bytes_xmit);
904 if (bytes_xmit) {
9360447d 905 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
906 pages = 1;
907 }
56e93d26
JQ
908 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
909 if (ret != RAM_SAVE_CONTROL_DELAYED) {
910 if (bytes_xmit > 0) {
9360447d 911 ram_counters.normal++;
56e93d26 912 } else if (bytes_xmit == 0) {
9360447d 913 ram_counters.duplicate++;
56e93d26
JQ
914 }
915 }
916 } else {
917 /* When starting the process of a new block, the first page of
918 * the block should be sent out before other pages in the same
919 * block, and all the pages in the last block should have been sent
920 * out. Keeping this order is important, because the 'cont' flag
921 * is used to avoid resending the block name.
922 */
6f37bb8b 923 if (block != rs->last_sent_block) {
ce25d337
JQ
924 flush_compressed_data(rs);
925 pages = save_zero_page(rs, block, offset, p);
56e93d26 926 if (pages == -1) {
fc50438e 927 /* Make sure the first page is sent out before other pages */
2bf3aa85 928 bytes_xmit = save_page_header(rs, rs->f, block, offset |
fc50438e 929 RAM_SAVE_FLAG_COMPRESS_PAGE);
ce25d337 930 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
fc50438e
LL
931 migrate_compress_level());
932 if (blen > 0) {
9360447d
JQ
933 ram_counters.transferred += bytes_xmit + blen;
934 ram_counters.normal++;
b3be2896 935 pages = 1;
fc50438e 936 } else {
ce25d337 937 qemu_file_set_error(rs->f, blen);
fc50438e 938 error_report("compressed data failed!");
b3be2896 939 }
56e93d26 940 }
53f09a10 941 if (pages > 0) {
a935e30f 942 ram_release_pages(block->idstr, offset, pages);
53f09a10 943 }
56e93d26 944 } else {
ce25d337 945 pages = save_zero_page(rs, block, offset, p);
56e93d26 946 if (pages == -1) {
ce25d337 947 pages = compress_page_with_multi_thread(rs, block, offset);
53f09a10 948 } else {
a935e30f 949 ram_release_pages(block->idstr, offset, pages);
56e93d26
JQ
950 }
951 }
952 }
953
954 return pages;
955}
956
3d0684b2
JQ
957/**
958 * find_dirty_block: find the next dirty page and update any state
959 * associated with the search process.
b9e60928 960 *
3d0684b2 961 * Returns whether a page was found
b9e60928 962 *
6f37bb8b 963 * @rs: current RAM state
3d0684b2
JQ
964 * @pss: data about the state of the current dirty page scan
965 * @again: set to false if the search has scanned the whole of RAM
b9e60928 966 */
f20e2865 967static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 968{
f20e2865 969 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 970 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 971 pss->page >= rs->last_page) {
b9e60928
DDAG
972 /*
973 * We've been once around the RAM and haven't found anything.
974 * Give up.
975 */
976 *again = false;
977 return false;
978 }
a935e30f 979 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 980 /* Didn't find anything in this RAM Block */
a935e30f 981 pss->page = 0;
b9e60928
DDAG
982 pss->block = QLIST_NEXT_RCU(pss->block, next);
983 if (!pss->block) {
984 /* Hit the end of the list */
985 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
986 /* Flag that we've looped */
987 pss->complete_round = true;
6f37bb8b 988 rs->ram_bulk_stage = false;
b9e60928
DDAG
989 if (migrate_use_xbzrle()) {
990 /* If xbzrle is on, stop using the data compression at this
991 * point. In theory, xbzrle can do better than compression.
992 */
ce25d337 993 flush_compressed_data(rs);
b9e60928
DDAG
994 }
995 }
996 /* Didn't find anything this time, but try again on the new block */
997 *again = true;
998 return false;
999 } else {
1000 /* Can go around again, but... */
1001 *again = true;
1002 /* We've found something so probably don't need to */
1003 return true;
1004 }
1005}
1006
3d0684b2
JQ
1007/**
1008 * unqueue_page: gets a page off the queue
1009 *
a82d593b 1010 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1011 *
3d0684b2
JQ
1012 * Returns the block of the page (or NULL if none available)
1013 *
ec481c6c 1014 * @rs: current RAM state
3d0684b2 1015 * @offset: used to return the offset within the RAMBlock
a82d593b 1016 */
f20e2865 1017static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1018{
1019 RAMBlock *block = NULL;
1020
ec481c6c
JQ
1021 qemu_mutex_lock(&rs->src_page_req_mutex);
1022 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1023 struct RAMSrcPageRequest *entry =
1024 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1025 block = entry->rb;
1026 *offset = entry->offset;
a82d593b
DDAG
1027
1028 if (entry->len > TARGET_PAGE_SIZE) {
1029 entry->len -= TARGET_PAGE_SIZE;
1030 entry->offset += TARGET_PAGE_SIZE;
1031 } else {
1032 memory_region_unref(block->mr);
ec481c6c 1033 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1034 g_free(entry);
1035 }
1036 }
ec481c6c 1037 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1038
1039 return block;
1040}
1041
3d0684b2
JQ
1042/**
1043 * get_queued_page: unqueue a page from the postcopy requests
1044 *
1045 * Skips pages that are already sent (!dirty)
a82d593b 1046 *
3d0684b2 1047 * Returns whether a queued page was found
a82d593b 1048 *
6f37bb8b 1049 * @rs: current RAM state
3d0684b2 1050 * @pss: data about the state of the current dirty page scan
a82d593b 1051 */
f20e2865 1052static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1053{
1054 RAMBlock *block;
1055 ram_addr_t offset;
1056 bool dirty;
1057
1058 do {
f20e2865 1059 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1060 /*
1061 * We're sending this page, and since it's postcopy nothing else
1062 * will dirty it, and we must make sure it doesn't get sent again
1063 * even if this queue request was received after the background
1064 * search already sent it.
1065 */
1066 if (block) {
f20e2865
JQ
1067 unsigned long page;
1068
6b6712ef
JQ
1069 page = offset >> TARGET_PAGE_BITS;
1070 dirty = test_bit(page, block->bmap);
a82d593b 1071 if (!dirty) {
06b10688 1072 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 1073 page, test_bit(page, block->unsentmap));
a82d593b 1074 } else {
f20e2865 1075 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1076 }
1077 }
1078
1079 } while (block && !dirty);
1080
1081 if (block) {
1082 /*
1083 * As soon as we start servicing pages out of order, we have to
1084 * kill the bulk stage, since the bulk stage assumes
1085 * (in migration_bitmap_find_and_reset_dirty) that every page is
1086 * dirty, which is no longer true.
1087 */
6f37bb8b 1088 rs->ram_bulk_stage = false;
a82d593b
DDAG
1089
1090 /*
1091 * We want the background search to continue from the queued page
1092 * since the guest is likely to want other pages near to the page
1093 * it just requested.
1094 */
1095 pss->block = block;
a935e30f 1096 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
1097 }
1098
1099 return !!block;
1100}
1101
6c595cde 1102/**
5e58f968
JQ
1103 * migration_page_queue_free: drop any remaining pages in the ram
1104 * request queue
6c595cde 1105 *
3d0684b2
JQ
1106 * It should be empty at the end anyway, but in error cases there may
1107 * be some left; if any page is left, we drop it.
1108 *
6c595cde 1109 */
83c13382 1110static void migration_page_queue_free(RAMState *rs)
6c595cde 1111{
ec481c6c 1112 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1113 /* This queue generally should be empty - but in the case of a failed
1114 * migration might have some droppings in.
1115 */
1116 rcu_read_lock();
ec481c6c 1117 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1118 memory_region_unref(mspr->rb->mr);
ec481c6c 1119 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1120 g_free(mspr);
1121 }
1122 rcu_read_unlock();
1123}
1124
1125/**
3d0684b2
JQ
1126 * ram_save_queue_pages: queue the page for transmission
1127 *
1128 * A request from postcopy destination for example.
1129 *
1130 * Returns zero on success or negative on error
1131 *
3d0684b2
JQ
1132 * @rbname: Name of the RAMBLock of the request. NULL means the
1133 * same that last one.
1134 * @start: starting address from the start of the RAMBlock
1135 * @len: length (in bytes) to send
6c595cde 1136 */
96506894 1137int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1138{
1139 RAMBlock *ramblock;
53518d94 1140 RAMState *rs = ram_state;
6c595cde 1141
9360447d 1142 ram_counters.postcopy_requests++;
6c595cde
DDAG
1143 rcu_read_lock();
1144 if (!rbname) {
1145 /* Reuse last RAMBlock */
68a098f3 1146 ramblock = rs->last_req_rb;
6c595cde
DDAG
1147
1148 if (!ramblock) {
1149 /*
1150 * Shouldn't happen, we can't reuse the last RAMBlock if
1151 * it's the 1st request.
1152 */
1153 error_report("ram_save_queue_pages no previous block");
1154 goto err;
1155 }
1156 } else {
1157 ramblock = qemu_ram_block_by_name(rbname);
1158
1159 if (!ramblock) {
1160 /* We shouldn't be asked for a non-existent RAMBlock */
1161 error_report("ram_save_queue_pages no block '%s'", rbname);
1162 goto err;
1163 }
68a098f3 1164 rs->last_req_rb = ramblock;
6c595cde
DDAG
1165 }
1166 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1167 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1168 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1169 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1170 __func__, start, len, ramblock->used_length);
1171 goto err;
1172 }
1173
ec481c6c
JQ
1174 struct RAMSrcPageRequest *new_entry =
1175 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1176 new_entry->rb = ramblock;
1177 new_entry->offset = start;
1178 new_entry->len = len;
1179
1180 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1181 qemu_mutex_lock(&rs->src_page_req_mutex);
1182 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1183 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1184 rcu_read_unlock();
1185
1186 return 0;
1187
1188err:
1189 rcu_read_unlock();
1190 return -1;
1191}
1192
a82d593b 1193/**
3d0684b2 1194 * ram_save_target_page: save one target page
a82d593b 1195 *
3d0684b2 1196 * Returns the number of pages written
a82d593b 1197 *
6f37bb8b 1198 * @rs: current RAM state
3d0684b2 1199 * @ms: current migration state
3d0684b2 1200 * @pss: data about the page we want to send
a82d593b 1201 * @last_stage: if we are at the completion stage
a82d593b 1202 */
a0a8aa14 1203static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1204 bool last_stage)
a82d593b
DDAG
1205{
1206 int res = 0;
1207
1208 /* Check if the page is dirty and, if so, send it */
f20e2865 1209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
6d358d94
JQ
1210 /*
1211 * If xbzrle is on, stop using the data compression after first
1212 * round of migration even if compression is enabled. In theory,
1213 * xbzrle can do better than compression.
1214 */
6b6712ef
JQ
1215 if (migrate_use_compression() &&
1216 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
a0a8aa14 1217 res = ram_save_compressed_page(rs, pss, last_stage);
a82d593b 1218 } else {
a0a8aa14 1219 res = ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1220 }
1221
1222 if (res < 0) {
1223 return res;
1224 }
6b6712ef
JQ
1225 if (pss->block->unsentmap) {
1226 clear_bit(pss->page, pss->block->unsentmap);
a82d593b
DDAG
1227 }
1228 }
1229
1230 return res;
1231}
1232
1233/**
3d0684b2 1234 * ram_save_host_page: save a whole host page
a82d593b 1235 *
3d0684b2
JQ
1236 * Starting at *offset send pages up to the end of the current host
1237 * page. It's valid for the initial offset to point into the middle of
1238 * a host page in which case the remainder of the hostpage is sent.
1239 * Only dirty target pages are sent. Note that the host page size may
1240 * be a huge page for this block.
1eb3fc0a
DDAG
1241 * The saving stops at the boundary of the used_length of the block
1242 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1243 *
3d0684b2
JQ
1244 * Returns the number of pages written or negative on error
1245 *
6f37bb8b 1246 * @rs: current RAM state
3d0684b2 1247 * @ms: current migration state
3d0684b2 1248 * @pss: data about the page we want to send
a82d593b 1249 * @last_stage: if we are at the completion stage
a82d593b 1250 */
a0a8aa14 1251static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1252 bool last_stage)
a82d593b
DDAG
1253{
1254 int tmppages, pages = 0;
a935e30f
JQ
1255 size_t pagesize_bits =
1256 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1257
a82d593b 1258 do {
f20e2865 1259 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1260 if (tmppages < 0) {
1261 return tmppages;
1262 }
1263
1264 pages += tmppages;
a935e30f 1265 pss->page++;
1eb3fc0a
DDAG
1266 } while ((pss->page & (pagesize_bits - 1)) &&
1267 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
1268
1269 /* The offset we leave with is the last one we looked at */
a935e30f 1270 pss->page--;
a82d593b
DDAG
1271 return pages;
1272}
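As an example, a 2 MiB hugepage RAMBlock with 4 KiB target pages gives pagesize_bits = 512, so the loop keeps sending target pages until pss->page crosses a 512-page boundary. A tiny sketch of just that boundary arithmetic (the starting page number is invented):

#include <stdio.h>

int main(void)
{
    unsigned long host_page     = 2 * 1024 * 1024;  /* hugepage-backed block */
    unsigned long target_page   = 4096;             /* assumed target page */
    unsigned long pagesize_bits = host_page / target_page;   /* 512 */
    unsigned long page          = 1000;             /* hypothetical start page */

    /* mirrors the loop-termination test: (pss->page & (pagesize_bits - 1)) */
    do {
        page++;
    } while (page & (pagesize_bits - 1));

    printf("started at target page 1000, stopped at %lu "
           "(first page of the next host page)\n", page);
    return 0;
}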
6c595cde 1273
56e93d26 1274/**
3d0684b2 1275 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1276 *
1277 * Called within an RCU critical section.
1278 *
3d0684b2 1279 * Returns the number of pages written where zero means no dirty pages
56e93d26 1280 *
6f37bb8b 1281 * @rs: current RAM state
56e93d26 1282 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1283 *
1284 * On systems where host-page-size > target-page-size it will send all the
1285 * pages in a host page that are dirty.
56e93d26
JQ
1286 */
1287
ce25d337 1288static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1289{
b8fb8cb7 1290 PageSearchStatus pss;
56e93d26 1291 int pages = 0;
b9e60928 1292 bool again, found;
56e93d26 1293
0827b9e9
AA
1294 /* No dirty page as there is zero RAM */
1295 if (!ram_bytes_total()) {
1296 return pages;
1297 }
1298
6f37bb8b 1299 pss.block = rs->last_seen_block;
a935e30f 1300 pss.page = rs->last_page;
b8fb8cb7
DDAG
1301 pss.complete_round = false;
1302
1303 if (!pss.block) {
1304 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1305 }
56e93d26 1306
b9e60928 1307 do {
a82d593b 1308 again = true;
f20e2865 1309 found = get_queued_page(rs, &pss);
b9e60928 1310
a82d593b
DDAG
1311 if (!found) {
1312 /* priority queue empty, so just search for something dirty */
f20e2865 1313 found = find_dirty_block(rs, &pss, &again);
a82d593b 1314 }
f3f491fc 1315
a82d593b 1316 if (found) {
f20e2865 1317 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1318 }
b9e60928 1319 } while (!pages && again);
56e93d26 1320
6f37bb8b 1321 rs->last_seen_block = pss.block;
a935e30f 1322 rs->last_page = pss.page;
56e93d26
JQ
1323
1324 return pages;
1325}
1326
1327void acct_update_position(QEMUFile *f, size_t size, bool zero)
1328{
1329 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1330
56e93d26 1331 if (zero) {
9360447d 1332 ram_counters.duplicate += pages;
56e93d26 1333 } else {
9360447d
JQ
1334 ram_counters.normal += pages;
1335 ram_counters.transferred += size;
56e93d26
JQ
1336 qemu_update_position(f, size);
1337 }
1338}
1339
56e93d26
JQ
1340uint64_t ram_bytes_total(void)
1341{
1342 RAMBlock *block;
1343 uint64_t total = 0;
1344
1345 rcu_read_lock();
99e15582 1346 RAMBLOCK_FOREACH(block) {
56e93d26 1347 total += block->used_length;
99e15582 1348 }
56e93d26
JQ
1349 rcu_read_unlock();
1350 return total;
1351}
1352
1353void free_xbzrle_decoded_buf(void)
1354{
1355 g_free(xbzrle_decoded_buf);
1356 xbzrle_decoded_buf = NULL;
1357}
1358
6ad2a215 1359static void ram_migration_cleanup(void *opaque)
56e93d26 1360{
53518d94 1361 RAMState **rsp = opaque;
6b6712ef 1362 RAMBlock *block;
eb859c53 1363
2ff64038
LZ
1364 /* The caller must hold the iothread lock or be in a bottom half, so
1365 * there is no write race against this migration bitmap
1366 */
6b6712ef
JQ
1367 memory_global_dirty_log_stop();
1368
1369 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1370 g_free(block->bmap);
1371 block->bmap = NULL;
1372 g_free(block->unsentmap);
1373 block->unsentmap = NULL;
56e93d26
JQ
1374 }
1375
1376 XBZRLE_cache_lock();
1377 if (XBZRLE.cache) {
1378 cache_fini(XBZRLE.cache);
1379 g_free(XBZRLE.encoded_buf);
1380 g_free(XBZRLE.current_buf);
c00e0928 1381 g_free(XBZRLE.zero_target_page);
56e93d26
JQ
1382 XBZRLE.cache = NULL;
1383 XBZRLE.encoded_buf = NULL;
1384 XBZRLE.current_buf = NULL;
c00e0928 1385 XBZRLE.zero_target_page = NULL;
56e93d26
JQ
1386 }
1387 XBZRLE_cache_unlock();
53518d94
JQ
1388 migration_page_queue_free(*rsp);
1389 g_free(*rsp);
1390 *rsp = NULL;
56e93d26
JQ
1391}
1392
6f37bb8b 1393static void ram_state_reset(RAMState *rs)
56e93d26 1394{
6f37bb8b
JQ
1395 rs->last_seen_block = NULL;
1396 rs->last_sent_block = NULL;
269ace29 1397 rs->last_page = 0;
6f37bb8b
JQ
1398 rs->last_version = ram_list.version;
1399 rs->ram_bulk_stage = true;
56e93d26
JQ
1400}
1401
1402#define MAX_WAIT 50 /* ms, half buffered_file limit */
1403
4f2e4252
DDAG
1404/*
1405 * 'expected' is the value you expect the bitmap mostly to be full
1406 * of; it won't bother printing lines that are all this value.
1407 * If 'todump' is null the migration bitmap is dumped.
1408 */
6b6712ef
JQ
1409void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1410 unsigned long pages)
4f2e4252 1411{
4f2e4252
DDAG
1412 int64_t cur;
1413 int64_t linelen = 128;
1414 char linebuf[129];
1415
6b6712ef 1416 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1417 int64_t curb;
1418 bool found = false;
1419 /*
1420 * Last line; catch the case where the line length
1421 * is longer than remaining ram
1422 */
6b6712ef
JQ
1423 if (cur + linelen > pages) {
1424 linelen = pages - cur;
4f2e4252
DDAG
1425 }
1426 for (curb = 0; curb < linelen; curb++) {
1427 bool thisbit = test_bit(cur + curb, todump);
1428 linebuf[curb] = thisbit ? '1' : '.';
1429 found = found || (thisbit != expected);
1430 }
1431 if (found) {
1432 linebuf[curb] = '\0';
1433 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1434 }
1435 }
1436}
1437
e0b266f0
DDAG
1438/* **** functions for postcopy ***** */
1439
ced1c616
PB
1440void ram_postcopy_migrated_memory_release(MigrationState *ms)
1441{
1442 struct RAMBlock *block;
ced1c616 1443
99e15582 1444 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1445 unsigned long *bitmap = block->bmap;
1446 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1447 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1448
1449 while (run_start < range) {
1450 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 1451 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
1452 (run_end - run_start) << TARGET_PAGE_BITS);
1453 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1454 }
1455 }
1456}
1457
3d0684b2
JQ
1458/**
1459 * postcopy_send_discard_bm_ram: discard a RAMBlock
1460 *
1461 * Returns zero on success
1462 *
e0b266f0
DDAG
1463 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1464 * Note: At this point the 'unsentmap' is the processed bitmap combined
1465 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1466 *
1467 * @ms: current migration state
1468 * @pds: state for postcopy
1469 * @start: RAMBlock starting page
1470 * @length: RAMBlock size
e0b266f0
DDAG
1471 */
1472static int postcopy_send_discard_bm_ram(MigrationState *ms,
1473 PostcopyDiscardState *pds,
6b6712ef 1474 RAMBlock *block)
e0b266f0 1475{
6b6712ef 1476 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1477 unsigned long current;
6b6712ef 1478 unsigned long *unsentmap = block->unsentmap;
e0b266f0 1479
6b6712ef 1480 for (current = 0; current < end; ) {
e0b266f0
DDAG
1481 unsigned long one = find_next_bit(unsentmap, end, current);
1482
1483 if (one <= end) {
1484 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1485 unsigned long discard_length;
1486
1487 if (zero >= end) {
1488 discard_length = end - one;
1489 } else {
1490 discard_length = zero - one;
1491 }
d688c62d
DDAG
1492 if (discard_length) {
1493 postcopy_discard_send_range(ms, pds, one, discard_length);
1494 }
e0b266f0
DDAG
1495 current = one + discard_length;
1496 } else {
1497 current = one;
1498 }
1499 }
1500
1501 return 0;
1502}
1503
3d0684b2
JQ
1504/**
1505 * postcopy_each_ram_send_discard: discard all RAMBlocks
1506 *
1507 * Returns 0 for success or negative for error
1508 *
e0b266f0
DDAG
1509 * Utility for the outgoing postcopy code.
1510 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1511 * passing it bitmap indexes and name.
e0b266f0
DDAG
1512 * (qemu_ram_foreach_block ends up passing unscaled lengths
1513 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1514 *
1515 * @ms: current migration state
e0b266f0
DDAG
1516 */
1517static int postcopy_each_ram_send_discard(MigrationState *ms)
1518{
1519 struct RAMBlock *block;
1520 int ret;
1521
99e15582 1522 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1523 PostcopyDiscardState *pds =
1524 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
1525
1526 /*
1527 * Postcopy sends chunks of bitmap over the wire, but it
1528 * just needs indexes at this point, avoids it having
1529 * target page specific code.
1530 */
6b6712ef 1531 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
1532 postcopy_discard_send_finish(ms, pds);
1533 if (ret) {
1534 return ret;
1535 }
1536 }
1537
1538 return 0;
1539}
1540
3d0684b2
JQ
1541/**
1542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1543 *
1544 * Helper for postcopy_chunk_hostpages; it's called twice to
1545 * canonicalize the two bitmaps, that are similar, but one is
1546 * inverted.
99e314eb 1547 *
3d0684b2
JQ
1548 * Postcopy requires that all target pages in a hostpage are dirty or
1549 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1550 *
3d0684b2
JQ
1551 * @ms: current migration state
1552 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1553 * otherwise we need to canonicalize partially dirty host pages
1554 * @block: block that contains the page we want to canonicalize
1555 * @pds: state for postcopy
99e314eb
DDAG
1556 */
1557static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1558 RAMBlock *block,
1559 PostcopyDiscardState *pds)
1560{
53518d94 1561 RAMState *rs = ram_state;
6b6712ef
JQ
1562 unsigned long *bitmap = block->bmap;
1563 unsigned long *unsentmap = block->unsentmap;
29c59172 1564 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 1565 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
1566 unsigned long run_start;
1567
29c59172
DDAG
1568 if (block->page_size == TARGET_PAGE_SIZE) {
1569 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1570 return;
1571 }
1572
99e314eb
DDAG
1573 if (unsent_pass) {
1574 /* Find a sent page */
6b6712ef 1575 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
1576 } else {
1577 /* Find a dirty page */
6b6712ef 1578 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
1579 }
1580
6b6712ef 1581 while (run_start < pages) {
99e314eb
DDAG
1582 bool do_fixup = false;
1583 unsigned long fixup_start_addr;
1584 unsigned long host_offset;
1585
1586 /*
1587 * If the start of this run of pages is in the middle of a host
1588 * page, then we need to fixup this host page.
1589 */
1590 host_offset = run_start % host_ratio;
1591 if (host_offset) {
1592 do_fixup = true;
1593 run_start -= host_offset;
1594 fixup_start_addr = run_start;
1595 /* For the next pass */
1596 run_start = run_start + host_ratio;
1597 } else {
1598 /* Find the end of this run */
1599 unsigned long run_end;
1600 if (unsent_pass) {
6b6712ef 1601 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 1602 } else {
6b6712ef 1603 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
1604 }
1605 /*
1606 * If the end isn't at the start of a host page, then the
1607 * run doesn't finish at the end of a host page
1608 * and we need to discard.
1609 */
1610 host_offset = run_end % host_ratio;
1611 if (host_offset) {
1612 do_fixup = true;
1613 fixup_start_addr = run_end - host_offset;
1614 /*
1615 * This host page has gone, the next loop iteration starts
1616 * from after the fixup
1617 */
1618 run_start = fixup_start_addr + host_ratio;
1619 } else {
1620 /*
1621 * No discards on this iteration, next loop starts from
1622 * next sent/dirty page
1623 */
1624 run_start = run_end + 1;
1625 }
1626 }
1627
1628 if (do_fixup) {
1629 unsigned long page;
1630
1631 /* Tell the destination to discard this page */
1632 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1633 /* For the unsent_pass we:
1634 * discard partially sent pages
1635 * For the !unsent_pass (dirty) we:
1636 * discard partially dirty pages that were sent
1637 * (any partially sent pages were already discarded
1638 * by the previous unsent_pass)
1639 */
1640 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1641 host_ratio);
1642 }
1643
1644 /* Clean up the bitmap */
1645 for (page = fixup_start_addr;
1646 page < fixup_start_addr + host_ratio; page++) {
1647 /* All pages in this host page are now not sent */
1648 set_bit(page, unsentmap);
1649
1650 /*
1651 * Remark them as dirty, updating the count for any pages
1652 * that weren't previously dirty.
1653 */
0d8ec885 1654 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1655 }
1656 }
1657
1658 if (unsent_pass) {
1659 /* Find the next sent page for the next iteration */
6b6712ef 1660 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
1661 } else {
1662 /* Find the next dirty page for the next iteration */
6b6712ef 1663 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
1664 }
1665 }
1666}
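
/*
 * Editor's sketch (not part of ram.c): a minimal, self-contained
 * illustration of the host-page alignment fix-up performed above.
 * Given a run boundary expressed in target pages and the host/target
 * page-size ratio, it rounds the boundary down to the start of the
 * containing host page, which is the unit that has to be discarded or
 * re-dirtied.  All names and values here are hypothetical.
 */
#include <stdio.h>

/* Round a target-page index down to the start of its host page. */
static unsigned long host_page_start(unsigned long run_start,
                                     unsigned int host_ratio)
{
    return run_start - (run_start % host_ratio);
}

int main(void)
{
    unsigned int host_ratio = 512;   /* e.g. 2MB huge page / 4KB target page */
    unsigned long run_start = 1000;  /* a run beginning mid host page */

    unsigned long fixup = host_page_start(run_start, host_ratio);
    printf("run at %lu -> fix up host page [%lu, %lu)\n",
           run_start, fixup, fixup + host_ratio);
    return 0;
}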
1667
3d0684b2
JQ
1668/**
1669 * postcopy_chunk_hostpages: discard any partially sent host page
1670 *
99e314eb
DDAG
1671 * Utility for the outgoing postcopy code.
1672 *
1673 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1674 * dirty host-page size chunks as all dirty. In this case the host-page
1675 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1676 *
3d0684b2
JQ
1677 * Returns zero on success
1678 *
1679 * @ms: current migration state
6b6712ef 1680 * @block: block we want to work with
99e314eb 1681 */
6b6712ef 1682static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 1683{
6b6712ef
JQ
1684 PostcopyDiscardState *pds =
1685 postcopy_discard_send_init(ms, block->idstr);
99e314eb 1686
6b6712ef
JQ
1687 /* First pass: Discard all partially sent host pages */
1688 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1689 /*
1690 * Second pass: Ensure that all partially dirty host pages are made
1691 * fully dirty.
1692 */
1693 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 1694
6b6712ef 1695 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
1696 return 0;
1697}
1698
3d0684b2
JQ
1699/**
1700 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1701 *
1702 * Returns zero on success
1703 *
e0b266f0
DDAG
1704 * Transmit the set of pages to be discarded after precopy to the target;
1705 * these are pages that:
1706 * a) Have been previously transmitted but are now dirty again
1707 * b) Have never been transmitted; this ensures that
1708 * any pages on the destination that have been mapped by background
1709 * tasks get discarded (transparent huge pages are the specific concern)
1710 * Hopefully this is pretty sparse
3d0684b2
JQ
1711 *
1712 * @ms: current migration state
e0b266f0
DDAG
1713 */
1714int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1715{
53518d94 1716 RAMState *rs = ram_state;
6b6712ef 1717 RAMBlock *block;
e0b266f0 1718 int ret;
e0b266f0
DDAG
1719
1720 rcu_read_lock();
1721
1722 /* This should be our last sync, the src is now paused */
eb859c53 1723 migration_bitmap_sync(rs);
e0b266f0 1724
6b6712ef
JQ
1725 /* Easiest way to make sure we don't resume in the middle of a host-page */
1726 rs->last_seen_block = NULL;
1727 rs->last_sent_block = NULL;
1728 rs->last_page = 0;
e0b266f0 1729
6b6712ef
JQ
1730 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1731 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1732 unsigned long *bitmap = block->bmap;
1733 unsigned long *unsentmap = block->unsentmap;
1734
1735 if (!unsentmap) {
1736 /* We don't have a safe way to resize the sentmap, so
1737 * if the bitmap was resized it will be NULL at this
1738 * point.
1739 */
1740 error_report("migration ram resized during precopy phase");
1741 rcu_read_unlock();
1742 return -EINVAL;
1743 }
1744 /* Deal with TPS != HPS and huge pages */
1745 ret = postcopy_chunk_hostpages(ms, block);
1746 if (ret) {
1747 rcu_read_unlock();
1748 return ret;
1749 }
e0b266f0 1750
6b6712ef
JQ
1751 /*
1752 * Update the unsentmap to be unsentmap = unsentmap | dirty
1753 */
1754 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 1755#ifdef DEBUG_POSTCOPY
6b6712ef 1756 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 1757#endif
6b6712ef
JQ
1758 }
1759 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
1760
1761 ret = postcopy_each_ram_send_discard(ms);
1762 rcu_read_unlock();
1763
1764 return ret;
1765}
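
/*
 * Editor's sketch (not part of ram.c): the "unsentmap |= dirty" step
 * above, written against plain unsigned long arrays instead of QEMU's
 * bitmap helpers.  A set bit in the result means "tell the destination
 * to discard this page".  Names and sizes are hypothetical.
 */
#include <limits.h>

#define SKETCH_BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static void bitmap_or_sketch(unsigned long *dst, const unsigned long *unsent,
                             const unsigned long *dirty, unsigned long nbits)
{
    unsigned long nlongs = (nbits + SKETCH_BITS_PER_LONG - 1) /
                           SKETCH_BITS_PER_LONG;

    for (unsigned long i = 0; i < nlongs; i++) {
        /* Discard pages that were never sent OR have been re-dirtied. */
        dst[i] = unsent[i] | dirty[i];
    }
}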
1766
3d0684b2
JQ
1767/**
1768 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1769 *
3d0684b2 1770 * Returns zero on success
e0b266f0 1771 *
36449157
JQ
1772 * @rbname: name of the RAMBlock of the request. NULL means the
1773 * same as the last one.
3d0684b2
JQ
1774 * @start: start address within the RAMBlock
1775 * @length: length of the range to discard
e0b266f0 1776 */
aaa2064c 1777int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
1778{
1779 int ret = -1;
1780
36449157 1781 trace_ram_discard_range(rbname, start, length);
d3a5038c 1782
e0b266f0 1783 rcu_read_lock();
36449157 1784 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1785
1786 if (!rb) {
36449157 1787 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1788 goto err;
1789 }
1790
d3a5038c 1791 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1792
1793err:
1794 rcu_read_unlock();
1795
1796 return ret;
1797}
1798
53518d94 1799static int ram_state_init(RAMState **rsp)
56e93d26 1800{
53518d94
JQ
1801 *rsp = g_new0(RAMState, 1);
1802
1803 qemu_mutex_init(&(*rsp)->bitmap_mutex);
1804 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1805 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26
JQ
1806
1807 if (migrate_use_xbzrle()) {
1808 XBZRLE_cache_lock();
c00e0928 1809 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1810 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1811 TARGET_PAGE_SIZE,
1812 TARGET_PAGE_SIZE);
1813 if (!XBZRLE.cache) {
1814 XBZRLE_cache_unlock();
1815 error_report("Error creating cache");
53518d94
JQ
1816 g_free(*rsp);
1817 *rsp = NULL;
56e93d26
JQ
1818 return -1;
1819 }
1820 XBZRLE_cache_unlock();
1821
1822 /* We prefer not to abort if there is no memory */
1823 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1824 if (!XBZRLE.encoded_buf) {
1825 error_report("Error allocating encoded_buf");
53518d94
JQ
1826 g_free(*rsp);
1827 *rsp = NULL;
56e93d26
JQ
1828 return -1;
1829 }
1830
1831 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1832 if (!XBZRLE.current_buf) {
1833 error_report("Error allocating current_buf");
1834 g_free(XBZRLE.encoded_buf);
1835 XBZRLE.encoded_buf = NULL;
53518d94
JQ
1836 g_free(*rsp);
1837 *rsp = NULL;
56e93d26
JQ
1838 return -1;
1839 }
56e93d26
JQ
1840 }
1841
49877834
PB
1842 /* For memory_global_dirty_log_start below. */
1843 qemu_mutex_lock_iothread();
1844
56e93d26
JQ
1845 qemu_mutex_lock_ramlist();
1846 rcu_read_lock();
53518d94 1847 ram_state_reset(*rsp);
56e93d26 1848
0827b9e9
AA
1849 /* Skip setting bitmap if there is no RAM */
1850 if (ram_bytes_total()) {
6b6712ef
JQ
1851 RAMBlock *block;
1852
1853 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1854 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
0827b9e9 1855
6b6712ef
JQ
1856 block->bmap = bitmap_new(pages);
1857 bitmap_set(block->bmap, 0, pages);
1858 if (migrate_postcopy_ram()) {
1859 block->unsentmap = bitmap_new(pages);
1860 bitmap_set(block->unsentmap, 0, pages);
1861 }
0827b9e9 1862 }
f3f491fc
DDAG
1863 }
1864
56e93d26
JQ
1865 /*
1866 * Count the total number of pages used by ram blocks not including any
1867 * gaps due to alignment or unplugs.
1868 */
53518d94 1869 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
1870
1871 memory_global_dirty_log_start();
53518d94 1872 migration_bitmap_sync(*rsp);
56e93d26 1873 qemu_mutex_unlock_ramlist();
49877834 1874 qemu_mutex_unlock_iothread();
a91246c9
HZ
1875 rcu_read_unlock();
1876
1877 return 0;
1878}
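
/*
 * Editor's sketch (not part of ram.c): how a "start fully dirty" bitmap
 * like block->bmap above can be built in plain C -- one bit per target
 * page, all bits set, so the first migration pass sends every page.
 * TARGET_PAGE_BITS_SKETCH and the helper name are hypothetical; for
 * brevity the trailing bits past 'pages' are also set here, which the
 * real bitmap_set() avoids.
 */
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#define TARGET_PAGE_BITS_SKETCH 12   /* assume 4KB target pages */
#define BITS_PER_ULONG (sizeof(unsigned long) * CHAR_BIT)

static unsigned long *alloc_all_dirty_bitmap(unsigned long used_length)
{
    unsigned long pages = used_length >> TARGET_PAGE_BITS_SKETCH;
    unsigned long nlongs = (pages + BITS_PER_ULONG - 1) / BITS_PER_ULONG;
    unsigned long *bmap = calloc(nlongs, sizeof(*bmap));

    if (bmap) {
        memset(bmap, 0xff, nlongs * sizeof(*bmap)); /* mark every page dirty */
    }
    return bmap;
}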
1879
3d0684b2
JQ
1880/*
1881 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
1882 * a long-running RCU critical section. When RCU reclaims in the code
1883 * start to become numerous it will be necessary to reduce the
1884 * granularity of these critical sections.
1885 */
1886
3d0684b2
JQ
1887/**
1888 * ram_save_setup: Setup RAM for migration
1889 *
1890 * Returns zero to indicate success and negative for error
1891 *
1892 * @f: QEMUFile where to send the data
1893 * @opaque: RAMState pointer
1894 */
a91246c9
HZ
1895static int ram_save_setup(QEMUFile *f, void *opaque)
1896{
53518d94 1897 RAMState **rsp = opaque;
a91246c9
HZ
1898 RAMBlock *block;
1899
1900 /* migration has already setup the bitmap, reuse it. */
1901 if (!migration_in_colo_state()) {
53518d94 1902 if (ram_state_init(rsp) != 0) {
a91246c9 1903 return -1;
53518d94 1904 }
a91246c9 1905 }
53518d94 1906 (*rsp)->f = f;
a91246c9
HZ
1907
1908 rcu_read_lock();
56e93d26
JQ
1909
1910 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1911
99e15582 1912 RAMBLOCK_FOREACH(block) {
56e93d26
JQ
1913 qemu_put_byte(f, strlen(block->idstr));
1914 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1915 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
1916 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1917 qemu_put_be64(f, block->page_size);
1918 }
56e93d26
JQ
1919 }
1920
1921 rcu_read_unlock();
1922
1923 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1924 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1925
1926 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1927
1928 return 0;
1929}
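
/*
 * Editor's sketch (not part of ram.c): the byte layout produced by
 * ram_save_setup() above, shown with hypothetical put_be64()/record
 * helpers writing into a memory buffer.  The real code streams through
 * QEMUFile; the flag values are copied from the table at the top of
 * this file, everything else is an assumption for illustration.
 */
#include <stdint.h>
#include <string.h>

#define SKETCH_FLAG_MEM_SIZE 0x04
#define SKETCH_FLAG_EOS      0x10

static uint8_t *put_be64(uint8_t *p, uint64_t v)
{
    for (int shift = 56; shift >= 0; shift -= 8) {
        *p++ = (uint8_t)(v >> shift);
    }
    return p;
}

/* One block record: idstr length byte, idstr bytes, be64 used_length. */
static uint8_t *put_block_record(uint8_t *p, const char *idstr,
                                 uint64_t used_length)
{
    size_t len = strlen(idstr);

    *p++ = (uint8_t)len;
    memcpy(p, idstr, len);
    p += len;
    return put_be64(p, used_length);
}

/* Header: total RAM size with the MEM_SIZE flag, block records, then EOS. */
static size_t build_setup_header(uint8_t *buf, uint64_t total_ram,
                                 const char *idstr, uint64_t used_length)
{
    uint8_t *p = buf;

    p = put_be64(p, total_ram | SKETCH_FLAG_MEM_SIZE);
    p = put_block_record(p, idstr, used_length);
    p = put_be64(p, SKETCH_FLAG_EOS);
    return (size_t)(p - buf);
}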
1930
3d0684b2
JQ
1931/**
1932 * ram_save_iterate: iterative stage for migration
1933 *
1934 * Returns zero to indicate success and negative for error
1935 *
1936 * @f: QEMUFile where to send the data
1937 * @opaque: RAMState pointer
1938 */
56e93d26
JQ
1939static int ram_save_iterate(QEMUFile *f, void *opaque)
1940{
53518d94
JQ
1941 RAMState **temp = opaque;
1942 RAMState *rs = *temp;
56e93d26
JQ
1943 int ret;
1944 int i;
1945 int64_t t0;
5c90308f 1946 int done = 0;
56e93d26
JQ
1947
1948 rcu_read_lock();
6f37bb8b
JQ
1949 if (ram_list.version != rs->last_version) {
1950 ram_state_reset(rs);
56e93d26
JQ
1951 }
1952
1953 /* Read version before ram_list.blocks */
1954 smp_rmb();
1955
1956 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1957
1958 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1959 i = 0;
1960 while ((ret = qemu_file_rate_limit(f)) == 0) {
1961 int pages;
1962
ce25d337 1963 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
1964 /* no more pages to send */
1965 if (pages == 0) {
5c90308f 1966 done = 1;
56e93d26
JQ
1967 break;
1968 }
23b28c3c 1969 rs->iterations++;
070afca2 1970
56e93d26
JQ
1971 /* we want to check in the 1st loop, just in case it was the 1st time
1972 and we had to sync the dirty bitmap.
1973 qemu_get_clock_ns() is a bit expensive, so we only check once
1974 every few iterations
1975 */
1976 if ((i & 63) == 0) {
1977 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1978 if (t1 > MAX_WAIT) {
55c4446b 1979 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
1980 break;
1981 }
1982 }
1983 i++;
1984 }
ce25d337 1985 flush_compressed_data(rs);
56e93d26
JQ
1986 rcu_read_unlock();
1987
1988 /*
1989 * Must occur before EOS (or any QEMUFile operation)
1990 * because of RDMA protocol.
1991 */
1992 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1993
1994 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
9360447d 1995 ram_counters.transferred += 8;
56e93d26
JQ
1996
1997 ret = qemu_file_get_error(f);
1998 if (ret < 0) {
1999 return ret;
2000 }
2001
5c90308f 2002 return done;
56e93d26
JQ
2003}
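
/*
 * Editor's sketch (not part of ram.c): the "sample the clock only every
 * 64 iterations" pattern from ram_save_iterate(), using clock_gettime()
 * instead of qemu_clock_get_ns().  MAX_WAIT_MS and do_one_unit_of_work()
 * are hypothetical placeholders.
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define MAX_WAIT_MS 50

static int64_t now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

static void bounded_work_loop(bool (*do_one_unit_of_work)(void))
{
    int64_t t0 = now_ms();
    int i = 0;

    while (do_one_unit_of_work()) {
        /* The clock is comparatively expensive: sample it sparsely. */
        if ((i & 63) == 0 && now_ms() - t0 > MAX_WAIT_MS) {
            break;
        }
        i++;
    }
}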
2004
3d0684b2
JQ
2005/**
2006 * ram_save_complete: function called to send the remaining amount of ram
2007 *
2008 * Returns zero to indicate success
2009 *
2010 * Called with iothread lock
2011 *
2012 * @f: QEMUFile where to send the data
2013 * @opaque: RAMState pointer
2014 */
56e93d26
JQ
2015static int ram_save_complete(QEMUFile *f, void *opaque)
2016{
53518d94
JQ
2017 RAMState **temp = opaque;
2018 RAMState *rs = *temp;
6f37bb8b 2019
56e93d26
JQ
2020 rcu_read_lock();
2021
5727309d 2022 if (!migration_in_postcopy()) {
8d820d6f 2023 migration_bitmap_sync(rs);
663e6c1d 2024 }
56e93d26
JQ
2025
2026 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2027
2028 /* try transferring iterative blocks of memory */
2029
2030 /* flush all remaining blocks regardless of rate limiting */
2031 while (true) {
2032 int pages;
2033
ce25d337 2034 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
2035 /* no more blocks to send */
2036 if (pages == 0) {
2037 break;
2038 }
2039 }
2040
ce25d337 2041 flush_compressed_data(rs);
56e93d26 2042 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2043
2044 rcu_read_unlock();
d09a6fde 2045
56e93d26
JQ
2046 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2047
2048 return 0;
2049}
2050
c31b098f
DDAG
2051static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2052 uint64_t *non_postcopiable_pending,
2053 uint64_t *postcopiable_pending)
56e93d26 2054{
53518d94
JQ
2055 RAMState **temp = opaque;
2056 RAMState *rs = *temp;
56e93d26
JQ
2057 uint64_t remaining_size;
2058
9edabd4d 2059 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2060
5727309d 2061 if (!migration_in_postcopy() &&
663e6c1d 2062 remaining_size < max_size) {
56e93d26
JQ
2063 qemu_mutex_lock_iothread();
2064 rcu_read_lock();
8d820d6f 2065 migration_bitmap_sync(rs);
56e93d26
JQ
2066 rcu_read_unlock();
2067 qemu_mutex_unlock_iothread();
9edabd4d 2068 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2069 }
c31b098f
DDAG
2070
2071 /* We can do postcopy, and all the data is postcopiable */
2072 *postcopiable_pending += remaining_size;
56e93d26
JQ
2073}
2074
2075static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2076{
2077 unsigned int xh_len;
2078 int xh_flags;
063e760a 2079 uint8_t *loaded_data;
56e93d26
JQ
2080
2081 if (!xbzrle_decoded_buf) {
2082 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2083 }
063e760a 2084 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2085
2086 /* extract RLE header */
2087 xh_flags = qemu_get_byte(f);
2088 xh_len = qemu_get_be16(f);
2089
2090 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2091 error_report("Failed to load XBZRLE page - wrong compression!");
2092 return -1;
2093 }
2094
2095 if (xh_len > TARGET_PAGE_SIZE) {
2096 error_report("Failed to load XBZRLE page - len overflow!");
2097 return -1;
2098 }
2099 /* load data and decode */
063e760a 2100 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2101
2102 /* decode RLE */
063e760a 2103 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2104 TARGET_PAGE_SIZE) == -1) {
2105 error_report("Failed to load XBZRLE page - decode error!");
2106 return -1;
2107 }
2108
2109 return 0;
2110}
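
/*
 * Editor's sketch (not part of ram.c): the two-field header that
 * load_xbzrle() validates above -- one flag byte followed by a
 * big-endian 16-bit payload length that must not exceed a page.
 * The constant values here are assumptions for illustration only.
 */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_ENCODING_FLAG_XBZRLE 0x1
#define SKETCH_PAGE_SIZE            4096

/* Returns the payload length on success, -1 on a malformed header. */
static int parse_xbzrle_header(const uint8_t *buf, size_t buflen)
{
    if (buflen < 3) {
        return -1;
    }

    uint8_t flags = buf[0];
    unsigned int len = ((unsigned int)buf[1] << 8) | buf[2]; /* big endian */

    if (flags != SKETCH_ENCODING_FLAG_XBZRLE || len > SKETCH_PAGE_SIZE) {
        return -1;
    }
    return (int)len;
}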
2111
3d0684b2
JQ
2112/**
2113 * ram_block_from_stream: read a RAMBlock id from the migration stream
2114 *
2115 * Must be called from within a rcu critical section.
2116 *
56e93d26 2117 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2118 *
3d0684b2
JQ
2119 * @f: QEMUFile where to read the data from
2120 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
a7180877 2121 */
3d0684b2 2122static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2123{
2124 static RAMBlock *block = NULL;
2125 char id[256];
2126 uint8_t len;
2127
2128 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2129 if (!block) {
56e93d26
JQ
2130 error_report("Ack, bad migration stream!");
2131 return NULL;
2132 }
4c4bad48 2133 return block;
56e93d26
JQ
2134 }
2135
2136 len = qemu_get_byte(f);
2137 qemu_get_buffer(f, (uint8_t *)id, len);
2138 id[len] = 0;
2139
e3dd7493 2140 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2141 if (!block) {
2142 error_report("Can't find block %s", id);
2143 return NULL;
56e93d26
JQ
2144 }
2145
4c4bad48
HZ
2146 return block;
2147}
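
/*
 * Editor's sketch (not part of ram.c): the caching trick behind
 * RAM_SAVE_FLAG_CONTINUE above -- remember the last block looked up by
 * name and reuse it while the stream keeps sending pages for the same
 * block.  The struct, table and flag value below are hypothetical.
 */
#include <string.h>
#include <stddef.h>

#define SKETCH_FLAG_CONTINUE 0x20

struct sketch_block {
    const char *idstr;
};

static struct sketch_block sketch_blocks[] = {
    { "pc.ram" }, { "vga.vram" },
};

static struct sketch_block *lookup_block_by_name(const char *id)
{
    for (size_t i = 0; i < sizeof(sketch_blocks) / sizeof(sketch_blocks[0]); i++) {
        if (strcmp(sketch_blocks[i].idstr, id) == 0) {
            return &sketch_blocks[i];
        }
    }
    return NULL;
}

static struct sketch_block *block_for_page(int flags, const char *id)
{
    static struct sketch_block *last;

    if (flags & SKETCH_FLAG_CONTINUE) {
        return last;                  /* reuse the previous lookup */
    }
    last = lookup_block_by_name(id);  /* full name lookup only when needed */
    return last;
}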
2148
2149static inline void *host_from_ram_block_offset(RAMBlock *block,
2150 ram_addr_t offset)
2151{
2152 if (!offset_in_ramblock(block, offset)) {
2153 return NULL;
2154 }
2155
2156 return block->host + offset;
56e93d26
JQ
2157}
2158
3d0684b2
JQ
2159/**
2160 * ram_handle_compressed: handle the zero page case
2161 *
56e93d26
JQ
2162 * If a page (or a whole RDMA chunk) has been
2163 * determined to be zero, then zap it.
3d0684b2
JQ
2164 *
2165 * @host: host address for the zero page
2166 * @ch: what the page is filled with. We only support zero
2167 * @size: size of the zero page
56e93d26
JQ
2168 */
2169void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2170{
2171 if (ch != 0 || !is_zero_range(host, size)) {
2172 memset(host, ch, size);
2173 }
2174}
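
/*
 * Editor's sketch (not part of ram.c): why ram_handle_compressed() tests
 * the buffer before memset() above -- writing zeros into a page that is
 * already zero would needlessly dirty (and allocate) destination memory.
 * buffer_is_zero_sketch() is a hypothetical stand-in for buffer_is_zero().
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool buffer_is_zero_sketch(const uint8_t *p, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

static void fill_page(uint8_t *host, uint8_t ch, size_t size)
{
    /* Skip the write when it would not change anything. */
    if (ch != 0 || !buffer_is_zero_sketch(host, size)) {
        memset(host, ch, size);
    }
}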
2175
2176static void *do_data_decompress(void *opaque)
2177{
2178 DecompressParam *param = opaque;
2179 unsigned long pagesize;
33d151f4
LL
2180 uint8_t *des;
2181 int len;
56e93d26 2182
33d151f4 2183 qemu_mutex_lock(&param->mutex);
90e56fb4 2184 while (!param->quit) {
33d151f4
LL
2185 if (param->des) {
2186 des = param->des;
2187 len = param->len;
2188 param->des = 0;
2189 qemu_mutex_unlock(&param->mutex);
2190
56e93d26 2191 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2192 /* uncompress() can fail in some cases, especially
2193 * when the page is dirtied while being compressed; that's
2194 * not a problem because the dirty page will be retransferred
2195 * and uncompress() won't break the data in other pages.
2196 */
33d151f4
LL
2197 uncompress((Bytef *)des, &pagesize,
2198 (const Bytef *)param->compbuf, len);
73a8912b 2199
33d151f4
LL
2200 qemu_mutex_lock(&decomp_done_lock);
2201 param->done = true;
2202 qemu_cond_signal(&decomp_done_cond);
2203 qemu_mutex_unlock(&decomp_done_lock);
2204
2205 qemu_mutex_lock(&param->mutex);
2206 } else {
2207 qemu_cond_wait(&param->cond, &param->mutex);
2208 }
56e93d26 2209 }
33d151f4 2210 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2211
2212 return NULL;
2213}
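
/*
 * Editor's sketch (not part of ram.c): the hand-off pattern used by
 * do_data_decompress() above, reduced to plain pthreads.  The worker
 * sleeps on its condition variable until a job is published, processes
 * it outside the lock, then marks itself idle.  The 'job' payload and
 * process_job() are hypothetical; the real code signals the shared
 * decomp_done_cond under a separate lock.
 */
#include <pthread.h>
#include <stdbool.h>

struct worker {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    void *job;          /* non-NULL when there is work to do */
    bool quit;
    bool done;          /* true when idle */
};

static void process_job(void *job)
{
    (void)job;          /* stand-in for the real uncompress() call */
}

static void *worker_thread(void *opaque)
{
    struct worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->job) {
            void *job = w->job;

            w->job = NULL;
            pthread_mutex_unlock(&w->mutex);

            process_job(job);        /* heavy work runs unlocked */

            pthread_mutex_lock(&w->mutex);
            w->done = true;          /* tell the dispatcher we are idle */
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}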
2214
5533b2e9
LL
2215static void wait_for_decompress_done(void)
2216{
2217 int idx, thread_count;
2218
2219 if (!migrate_use_compression()) {
2220 return;
2221 }
2222
2223 thread_count = migrate_decompress_threads();
2224 qemu_mutex_lock(&decomp_done_lock);
2225 for (idx = 0; idx < thread_count; idx++) {
2226 while (!decomp_param[idx].done) {
2227 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2228 }
2229 }
2230 qemu_mutex_unlock(&decomp_done_lock);
2231}
2232
56e93d26
JQ
2233void migrate_decompress_threads_create(void)
2234{
2235 int i, thread_count;
2236
3416ab5b
JQ
2237 if (!migrate_use_compression()) {
2238 return;
2239 }
56e93d26
JQ
2240 thread_count = migrate_decompress_threads();
2241 decompress_threads = g_new0(QemuThread, thread_count);
2242 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2243 qemu_mutex_init(&decomp_done_lock);
2244 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2245 for (i = 0; i < thread_count; i++) {
2246 qemu_mutex_init(&decomp_param[i].mutex);
2247 qemu_cond_init(&decomp_param[i].cond);
2248 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2249 decomp_param[i].done = true;
90e56fb4 2250 decomp_param[i].quit = false;
56e93d26
JQ
2251 qemu_thread_create(decompress_threads + i, "decompress",
2252 do_data_decompress, decomp_param + i,
2253 QEMU_THREAD_JOINABLE);
2254 }
2255}
2256
2257void migrate_decompress_threads_join(void)
2258{
2259 int i, thread_count;
2260
3416ab5b
JQ
2261 if (!migrate_use_compression()) {
2262 return;
2263 }
56e93d26
JQ
2264 thread_count = migrate_decompress_threads();
2265 for (i = 0; i < thread_count; i++) {
2266 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2267 decomp_param[i].quit = true;
56e93d26
JQ
2268 qemu_cond_signal(&decomp_param[i].cond);
2269 qemu_mutex_unlock(&decomp_param[i].mutex);
2270 }
2271 for (i = 0; i < thread_count; i++) {
2272 qemu_thread_join(decompress_threads + i);
2273 qemu_mutex_destroy(&decomp_param[i].mutex);
2274 qemu_cond_destroy(&decomp_param[i].cond);
2275 g_free(decomp_param[i].compbuf);
2276 }
2277 g_free(decompress_threads);
2278 g_free(decomp_param);
56e93d26
JQ
2279 decompress_threads = NULL;
2280 decomp_param = NULL;
56e93d26
JQ
2281}
2282
c1bc6626 2283static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2284 void *host, int len)
2285{
2286 int idx, thread_count;
2287
2288 thread_count = migrate_decompress_threads();
73a8912b 2289 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2290 while (true) {
2291 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2292 if (decomp_param[idx].done) {
33d151f4
LL
2293 decomp_param[idx].done = false;
2294 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2295 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2296 decomp_param[idx].des = host;
2297 decomp_param[idx].len = len;
33d151f4
LL
2298 qemu_cond_signal(&decomp_param[idx].cond);
2299 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2300 break;
2301 }
2302 }
2303 if (idx < thread_count) {
2304 break;
73a8912b
LL
2305 } else {
2306 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2307 }
2308 }
73a8912b 2309 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2310}
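
/*
 * Editor's sketch (not part of ram.c): the dispatch loop from
 * decompress_data_with_multi_threads() above -- scan for an idle worker
 * under the shared "done" lock and, if every worker is busy, sleep on
 * the shared condition variable until one finishes.  NWORKERS and
 * hand_job_to_worker() are hypothetical placeholders.
 */
#include <pthread.h>
#include <stdbool.h>

#define NWORKERS 4

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;
static bool worker_done[NWORKERS] = { true, true, true, true };

static void hand_job_to_worker(int idx, void *job)
{
    (void)idx;
    (void)job;  /* stand-in for filling the worker's request and waking it */
}

static void dispatch_job(void *job)
{
    pthread_mutex_lock(&done_lock);
    for (;;) {
        for (int idx = 0; idx < NWORKERS; idx++) {
            if (worker_done[idx]) {
                worker_done[idx] = false;   /* claim this worker */
                hand_job_to_worker(idx, job);
                pthread_mutex_unlock(&done_lock);
                return;
            }
        }
        /* All workers busy: wait for any of them to signal completion. */
        pthread_cond_wait(&done_cond, &done_lock);
    }
}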
2311
3d0684b2
JQ
2312/**
2313 * ram_postcopy_incoming_init: allocate postcopy data structures
2314 *
2315 * Returns 0 for success and negative on error
2316 *
2317 * @mis: current migration incoming state
2318 *
2319 * Allocate data structures etc needed by incoming migration with
2320 * postcopy-ram. postcopy-ram's similarly named
2321 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2322 */
2323int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2324{
b8c48993 2325 unsigned long ram_pages = last_ram_page();
1caddf8a
DDAG
2326
2327 return postcopy_ram_incoming_init(mis, ram_pages);
2328}
2329
3d0684b2
JQ
2330/**
2331 * ram_load_postcopy: load a page in postcopy case
2332 *
2333 * Returns 0 for success or -errno in case of error
2334 *
a7180877
DDAG
2335 * Called in postcopy mode by ram_load().
2336 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2337 *
2338 * @f: QEMUFile to read the data from
a7180877
DDAG
2339 */
2340static int ram_load_postcopy(QEMUFile *f)
2341{
2342 int flags = 0, ret = 0;
2343 bool place_needed = false;
28abd200 2344 bool matching_page_sizes = false;
a7180877
DDAG
2345 MigrationIncomingState *mis = migration_incoming_get_current();
2346 /* Temporary page that is later 'placed' */
2347 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2348 void *last_host = NULL;
a3b6ff6d 2349 bool all_zero = false;
a7180877
DDAG
2350
2351 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2352 ram_addr_t addr;
2353 void *host = NULL;
2354 void *page_buffer = NULL;
2355 void *place_source = NULL;
df9ff5e1 2356 RAMBlock *block = NULL;
a7180877 2357 uint8_t ch;
a7180877
DDAG
2358
2359 addr = qemu_get_be64(f);
2360 flags = addr & ~TARGET_PAGE_MASK;
2361 addr &= TARGET_PAGE_MASK;
2362
2363 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2364 place_needed = false;
bb890ed5 2365 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2366 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2367
2368 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2369 if (!host) {
2370 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2371 ret = -EINVAL;
2372 break;
2373 }
28abd200 2374 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2375 /*
28abd200
DDAG
2376 * Postcopy requires that we place whole host pages atomically;
2377 * these may be huge pages for RAMBlocks that are backed by
2378 * hugetlbfs.
a7180877
DDAG
2379 * To make it atomic, the data is read into a temporary page
2380 * that's moved into place later.
2381 * The migration protocol uses, possibly smaller, target pages;
2382 * however, the source ensures it always sends all the components
2383 * of a host page in order.
2384 */
2385 page_buffer = postcopy_host_page +
28abd200 2386 ((uintptr_t)host & (block->page_size - 1));
a7180877 2387 /* If all TP are zero then we can optimise the place */
28abd200 2388 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2389 all_zero = true;
c53b7ddc
DDAG
2390 } else {
2391 /* not the 1st TP within the HP */
2392 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2393 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2394 host, last_host);
2395 ret = -EINVAL;
2396 break;
2397 }
a7180877
DDAG
2398 }
2399
c53b7ddc 2400
a7180877
DDAG
2401 /*
2402 * If it's the last part of a host page then we place the host
2403 * page
2404 */
2405 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2406 (block->page_size - 1)) == 0;
a7180877
DDAG
2407 place_source = postcopy_host_page;
2408 }
c53b7ddc 2409 last_host = host;
a7180877
DDAG
2410
2411 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 2412 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
2413 ch = qemu_get_byte(f);
2414 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2415 if (ch) {
2416 all_zero = false;
2417 }
2418 break;
2419
2420 case RAM_SAVE_FLAG_PAGE:
2421 all_zero = false;
2422 if (!place_needed || !matching_page_sizes) {
2423 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2424 } else {
2425 /* Avoids the qemu_file copy during postcopy, which is
2426 * going to do a copy later; can only do it when we
2427 * do this read in one go (matching page sizes)
2428 */
2429 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2430 TARGET_PAGE_SIZE);
2431 }
2432 break;
2433 case RAM_SAVE_FLAG_EOS:
2434 /* normal exit */
2435 break;
2436 default:
2437 error_report("Unknown combination of migration flags: %#x"
2438 " (postcopy mode)", flags);
2439 ret = -EINVAL;
2440 }
2441
2442 if (place_needed) {
2443 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2444 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2445
a7180877 2446 if (all_zero) {
df9ff5e1
DDAG
2447 ret = postcopy_place_page_zero(mis, place_dest,
2448 block->page_size);
a7180877 2449 } else {
df9ff5e1
DDAG
2450 ret = postcopy_place_page(mis, place_dest,
2451 place_source, block->page_size);
a7180877
DDAG
2452 }
2453 }
2454 if (!ret) {
2455 ret = qemu_file_get_error(f);
2456 }
2457 }
2458
2459 return ret;
2460}
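
/*
 * Editor's sketch (not part of ram.c): the offset arithmetic that
 * ram_load_postcopy() uses above to assemble a huge host page from
 * smaller target pages.  Page sizes are assumptions (4KB target pages
 * inside a 2MB host page) and addresses are plain integers here.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TP_SIZE 4096u               /* target page size */
#define HP_SIZE (2u * 1024 * 1024)  /* host (huge) page size */

int main(void)
{
    /* The last target page within its host page, as an example. */
    uintptr_t host = 0x40000000u + HP_SIZE - TP_SIZE;

    /* Where this target page lands inside the temporary host page. */
    uintptr_t buf_off = host & (HP_SIZE - 1);

    /* Is this the last target page of its host page? */
    bool place_needed = ((host + TP_SIZE) & (HP_SIZE - 1)) == 0;

    /* Start of the host page that would be placed atomically. */
    uintptr_t place_dest = (host + TP_SIZE) - HP_SIZE; /* valid if place_needed */

    printf("buf_off=%#lx place_needed=%d place_dest=%#lx\n",
           (unsigned long)buf_off, place_needed, (unsigned long)place_dest);
    return 0;
}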
2461
56e93d26
JQ
2462static int ram_load(QEMUFile *f, void *opaque, int version_id)
2463{
edc60127 2464 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
2465 static uint64_t seq_iter;
2466 int len = 0;
a7180877
DDAG
2467 /*
2468 * If the system is running in postcopy mode, page inserts to host memory must
2469 * be atomic
2470 */
2471 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2472 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2473 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2474
2475 seq_iter++;
2476
2477 if (version_id != 4) {
2478 ret = -EINVAL;
2479 }
2480
edc60127
JQ
2481 if (!migrate_use_compression()) {
2482 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2483 }
56e93d26
JQ
2484 /* This RCU critical section can be very long running.
2485 * When RCU reclaims in the code start to become numerous,
2486 * it will be necessary to reduce the granularity of this
2487 * critical section.
2488 */
2489 rcu_read_lock();
a7180877
DDAG
2490
2491 if (postcopy_running) {
2492 ret = ram_load_postcopy(f);
2493 }
2494
2495 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2496 ram_addr_t addr, total_ram_bytes;
a776aa15 2497 void *host = NULL;
56e93d26
JQ
2498 uint8_t ch;
2499
2500 addr = qemu_get_be64(f);
2501 flags = addr & ~TARGET_PAGE_MASK;
2502 addr &= TARGET_PAGE_MASK;
2503
edc60127
JQ
2504 if (flags & invalid_flags) {
2505 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2506 error_report("Received an unexpected compressed page");
2507 }
2508
2509 ret = -EINVAL;
2510 break;
2511 }
2512
bb890ed5 2513 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 2514 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2515 RAMBlock *block = ram_block_from_stream(f, flags);
2516
2517 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2518 if (!host) {
2519 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2520 ret = -EINVAL;
2521 break;
2522 }
1db9d8e5 2523 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
2524 }
2525
56e93d26
JQ
2526 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2527 case RAM_SAVE_FLAG_MEM_SIZE:
2528 /* Synchronize RAM block list */
2529 total_ram_bytes = addr;
2530 while (!ret && total_ram_bytes) {
2531 RAMBlock *block;
56e93d26
JQ
2532 char id[256];
2533 ram_addr_t length;
2534
2535 len = qemu_get_byte(f);
2536 qemu_get_buffer(f, (uint8_t *)id, len);
2537 id[len] = 0;
2538 length = qemu_get_be64(f);
2539
e3dd7493
DDAG
2540 block = qemu_ram_block_by_name(id);
2541 if (block) {
2542 if (length != block->used_length) {
2543 Error *local_err = NULL;
56e93d26 2544
fa53a0e5 2545 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2546 &local_err);
2547 if (local_err) {
2548 error_report_err(local_err);
56e93d26 2549 }
56e93d26 2550 }
ef08fb38
DDAG
2551 /* For postcopy we need to check hugepage sizes match */
2552 if (postcopy_advised &&
2553 block->page_size != qemu_host_page_size) {
2554 uint64_t remote_page_size = qemu_get_be64(f);
2555 if (remote_page_size != block->page_size) {
2556 error_report("Mismatched RAM page size %s "
2557 "(local) %zd != %" PRId64,
2558 id, block->page_size,
2559 remote_page_size);
2560 ret = -EINVAL;
2561 }
2562 }
e3dd7493
DDAG
2563 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2564 block->idstr);
2565 } else {
56e93d26
JQ
2566 error_report("Unknown ramblock \"%s\", cannot "
2567 "accept migration", id);
2568 ret = -EINVAL;
2569 }
2570
2571 total_ram_bytes -= length;
2572 }
2573 break;
a776aa15 2574
bb890ed5 2575 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
2576 ch = qemu_get_byte(f);
2577 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2578 break;
a776aa15 2579
56e93d26 2580 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2581 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2582 break;
56e93d26 2583
a776aa15 2584 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2585 len = qemu_get_be32(f);
2586 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2587 error_report("Invalid compressed data length: %d", len);
2588 ret = -EINVAL;
2589 break;
2590 }
c1bc6626 2591 decompress_data_with_multi_threads(f, host, len);
56e93d26 2592 break;
a776aa15 2593
56e93d26 2594 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2595 if (load_xbzrle(f, addr, host) < 0) {
2596 error_report("Failed to decompress XBZRLE page at "
2597 RAM_ADDR_FMT, addr);
2598 ret = -EINVAL;
2599 break;
2600 }
2601 break;
2602 case RAM_SAVE_FLAG_EOS:
2603 /* normal exit */
2604 break;
2605 default:
2606 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2607 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2608 } else {
2609 error_report("Unknown combination of migration flags: %#x",
2610 flags);
2611 ret = -EINVAL;
2612 }
2613 }
2614 if (!ret) {
2615 ret = qemu_file_get_error(f);
2616 }
2617 }
2618
5533b2e9 2619 wait_for_decompress_done();
56e93d26 2620 rcu_read_unlock();
55c4446b 2621 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2622 return ret;
2623}
2624
2625static SaveVMHandlers savevm_ram_handlers = {
9907e842 2626 .save_setup = ram_save_setup,
56e93d26 2627 .save_live_iterate = ram_save_iterate,
763c906b 2628 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2629 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2630 .save_live_pending = ram_save_pending,
2631 .load_state = ram_load,
70f794fc 2632 .save_cleanup = ram_migration_cleanup,
56e93d26
JQ
2633};
2634
2635void ram_mig_init(void)
2636{
2637 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2638 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2639}