git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
migration: Move remaining exported functions to migration/misc.h
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
709e3fe8 38#include "xbzrle.h"
7b1e1a22 39#include "ram.h"
56e93d26 40#include "migration/migration.h"
f2a8f0a6 41#include "migration/register.h"
7b1e1a22 42#include "migration/misc.h"
08a0aee1 43#include "qemu-file.h"
987772d9 44#include "migration/vmstate.h"
be07b0ac 45#include "postcopy-ram.h"
56e93d26
JQ
46#include "exec/address-spaces.h"
47#include "migration/page_cache.h"
56e93d26 48#include "qemu/error-report.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
56e93d26 51#include "qemu/rcu_queue.h"
a91246c9 52#include "migration/colo.h"
56e93d26 53
56e93d26
JQ
54/***********************************************************/
55/* ram save/restore */
56
bb890ed5
JQ
57/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
58 * worked for pages that were filled with the same char. We switched
59 * it to only search for the zero value, and renamed it to avoid
60 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
61 */
62
56e93d26 63#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 64#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
65#define RAM_SAVE_FLAG_MEM_SIZE 0x04
66#define RAM_SAVE_FLAG_PAGE 0x08
67#define RAM_SAVE_FLAG_EOS 0x10
68#define RAM_SAVE_FLAG_CONTINUE 0x20
69#define RAM_SAVE_FLAG_XBZRLE 0x40
70/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
71#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
72
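/* These flag bits are OR-ed into the low bits of the 64-bit page offset
 * written by save_page_header(): page offsets are TARGET_PAGE_SIZE aligned,
 * so the low bits of the on-wire value are free to carry flags.
 */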
56e93d26
JQ
73static inline bool is_zero_range(uint8_t *p, uint64_t size)
74{
a1febc49 75 return buffer_is_zero(p, size);
56e93d26
JQ
76}
77
9360447d
JQ
78XBZRLECacheStats xbzrle_counters;
79
56e93d26
JQ
80/* struct containing the XBZRLE cache and a static page
81 used by the compression */
82static struct {
83 /* buffer used for XBZRLE encoding */
84 uint8_t *encoded_buf;
85 /* buffer for storing page content */
86 uint8_t *current_buf;
87 /* Cache for XBZRLE, Protected by lock. */
88 PageCache *cache;
89 QemuMutex lock;
c00e0928
JQ
90 /* it will store a page full of zeros */
91 uint8_t *zero_target_page;
56e93d26
JQ
92} XBZRLE;
93
94/* buffer used for XBZRLE decoding */
95static uint8_t *xbzrle_decoded_buf;
96
97static void XBZRLE_cache_lock(void)
98{
99 if (migrate_use_xbzrle())
100 qemu_mutex_lock(&XBZRLE.lock);
101}
102
103static void XBZRLE_cache_unlock(void)
104{
105 if (migrate_use_xbzrle())
106 qemu_mutex_unlock(&XBZRLE.lock);
107}
108
3d0684b2
JQ
109/**
110 * xbzrle_cache_resize: resize the xbzrle cache
111 *
112 * This function is called from qmp_migrate_set_cache_size in the main
113 * thread, possibly while a migration is in progress. A running
114 * migration may be using the cache and might finish during this call,
115 * hence changes to the cache are protected by XBZRLE.lock.
116 *
117 * Returns the new cache size, rounded down to a power of two, or -1 on error.
118 *
119 * @new_size: new cache size
56e93d26
JQ
120 */
121int64_t xbzrle_cache_resize(int64_t new_size)
122{
123 PageCache *new_cache;
124 int64_t ret;
125
126 if (new_size < TARGET_PAGE_SIZE) {
127 return -1;
128 }
129
130 XBZRLE_cache_lock();
131
132 if (XBZRLE.cache != NULL) {
133 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
134 goto out_new_size;
135 }
136 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
137 TARGET_PAGE_SIZE);
138 if (!new_cache) {
139 error_report("Error creating cache");
140 ret = -1;
141 goto out;
142 }
143
144 cache_fini(XBZRLE.cache);
145 XBZRLE.cache = new_cache;
146 }
147
148out_new_size:
149 ret = pow2floor(new_size);
150out:
151 XBZRLE_cache_unlock();
152 return ret;
153}
154
ec481c6c
JQ
155/*
156 * An outstanding page request, on the source, having been received
157 * and queued
158 */
159struct RAMSrcPageRequest {
160 RAMBlock *rb;
161 hwaddr offset;
162 hwaddr len;
163
164 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
165};
166
6f37bb8b
JQ
167/* State of RAM for migration */
168struct RAMState {
204b88b8
JQ
169 /* QEMUFile used for this migration */
170 QEMUFile *f;
6f37bb8b
JQ
171 /* Last block that we have visited searching for dirty pages */
172 RAMBlock *last_seen_block;
173 /* Last block from where we have sent data */
174 RAMBlock *last_sent_block;
269ace29
JQ
175 /* Last dirty target page we have sent */
176 ram_addr_t last_page;
6f37bb8b
JQ
177 /* last ram version we have seen */
178 uint32_t last_version;
179 /* We are in the first round */
180 bool ram_bulk_stage;
8d820d6f
JQ
181 /* How many times we have dirtied too many pages */
182 int dirty_rate_high_cnt;
f664da80
JQ
183 /* these variables are used for bitmap sync */
184 /* last time we did a full bitmap_sync */
185 int64_t time_last_bitmap_sync;
eac74159 186 /* bytes transferred at start_time */
c4bdf0cf 187 uint64_t bytes_xfer_prev;
a66cd90c 188 /* number of dirty pages since start_time */
68908ed6 189 uint64_t num_dirty_pages_period;
b5833fde
JQ
190 /* xbzrle misses since the beginning of the period */
191 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
192 /* number of iterations at the beginning of period */
193 uint64_t iterations_prev;
23b28c3c
JQ
194 /* Iterations since start */
195 uint64_t iterations;
108cfae0 196 /* number of dirty bits in the bitmap */
9360447d
JQ
197 uint64_t migration_dirty_pages;
198 /* protects modification of the bitmap */
108cfae0 199 QemuMutex bitmap_mutex;
68a098f3
JQ
200 /* The RAMBlock used in the last src_page_requests */
201 RAMBlock *last_req_rb;
ec481c6c
JQ
202 /* Queue of outstanding page requests from the destination */
203 QemuMutex src_page_req_mutex;
204 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
205};
206typedef struct RAMState RAMState;
207
53518d94 208static RAMState *ram_state;
6f37bb8b 209
9edabd4d 210uint64_t ram_bytes_remaining(void)
2f4fde93 211{
53518d94 212 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
213}
214
9360447d 215MigrationStats ram_counters;
96506894 216
b8fb8cb7
DDAG
217/* used by the search for pages to send */
218struct PageSearchStatus {
219 /* Current block being searched */
220 RAMBlock *block;
a935e30f
JQ
221 /* Current page to search from */
222 unsigned long page;
b8fb8cb7
DDAG
223 /* Set once we wrap around */
224 bool complete_round;
225};
226typedef struct PageSearchStatus PageSearchStatus;
227
56e93d26 228struct CompressParam {
56e93d26 229 bool done;
90e56fb4 230 bool quit;
56e93d26
JQ
231 QEMUFile *file;
232 QemuMutex mutex;
233 QemuCond cond;
234 RAMBlock *block;
235 ram_addr_t offset;
236};
237typedef struct CompressParam CompressParam;
238
239struct DecompressParam {
73a8912b 240 bool done;
90e56fb4 241 bool quit;
56e93d26
JQ
242 QemuMutex mutex;
243 QemuCond cond;
244 void *des;
d341d9f3 245 uint8_t *compbuf;
56e93d26
JQ
246 int len;
247};
248typedef struct DecompressParam DecompressParam;
249
250static CompressParam *comp_param;
251static QemuThread *compress_threads;
252/* comp_done_cond is used to wake up the migration thread when
253 * one of the compression threads has finished the compression.
254 * comp_done_lock is used to co-work with comp_done_cond.
255 */
0d9f9a5c
LL
256static QemuMutex comp_done_lock;
257static QemuCond comp_done_cond;
56e93d26
JQ
258/* The empty QEMUFileOps will be used by file in CompressParam */
259static const QEMUFileOps empty_ops = { };
260
56e93d26
JQ
261static DecompressParam *decomp_param;
262static QemuThread *decompress_threads;
73a8912b
LL
263static QemuMutex decomp_done_lock;
264static QemuCond decomp_done_cond;
56e93d26 265
a7a9a88f
LL
266static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
267 ram_addr_t offset);
56e93d26
JQ
268
269static void *do_data_compress(void *opaque)
270{
271 CompressParam *param = opaque;
a7a9a88f
LL
272 RAMBlock *block;
273 ram_addr_t offset;
56e93d26 274
a7a9a88f 275 qemu_mutex_lock(&param->mutex);
90e56fb4 276 while (!param->quit) {
a7a9a88f
LL
277 if (param->block) {
278 block = param->block;
279 offset = param->offset;
280 param->block = NULL;
281 qemu_mutex_unlock(&param->mutex);
282
283 do_compress_ram_page(param->file, block, offset);
284
0d9f9a5c 285 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 286 param->done = true;
0d9f9a5c
LL
287 qemu_cond_signal(&comp_done_cond);
288 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
289
290 qemu_mutex_lock(&param->mutex);
291 } else {
56e93d26
JQ
292 qemu_cond_wait(&param->cond, &param->mutex);
293 }
56e93d26 294 }
a7a9a88f 295 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
296
297 return NULL;
298}
299
300static inline void terminate_compression_threads(void)
301{
302 int idx, thread_count;
303
304 thread_count = migrate_compress_threads();
3d0684b2 305
56e93d26
JQ
306 for (idx = 0; idx < thread_count; idx++) {
307 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 308 comp_param[idx].quit = true;
56e93d26
JQ
309 qemu_cond_signal(&comp_param[idx].cond);
310 qemu_mutex_unlock(&comp_param[idx].mutex);
311 }
312}
313
314void migrate_compress_threads_join(void)
315{
316 int i, thread_count;
317
318 if (!migrate_use_compression()) {
319 return;
320 }
321 terminate_compression_threads();
322 thread_count = migrate_compress_threads();
323 for (i = 0; i < thread_count; i++) {
324 qemu_thread_join(compress_threads + i);
325 qemu_fclose(comp_param[i].file);
326 qemu_mutex_destroy(&comp_param[i].mutex);
327 qemu_cond_destroy(&comp_param[i].cond);
328 }
0d9f9a5c
LL
329 qemu_mutex_destroy(&comp_done_lock);
330 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
331 g_free(compress_threads);
332 g_free(comp_param);
56e93d26
JQ
333 compress_threads = NULL;
334 comp_param = NULL;
56e93d26
JQ
335}
336
337void migrate_compress_threads_create(void)
338{
339 int i, thread_count;
340
341 if (!migrate_use_compression()) {
342 return;
343 }
56e93d26
JQ
344 thread_count = migrate_compress_threads();
345 compress_threads = g_new0(QemuThread, thread_count);
346 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
347 qemu_cond_init(&comp_done_cond);
348 qemu_mutex_init(&comp_done_lock);
56e93d26 349 for (i = 0; i < thread_count; i++) {
e110aa91
C
350 /* comp_param[i].file is just used as a dummy buffer to save data,
351 * set its ops to empty.
56e93d26
JQ
352 */
353 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
354 comp_param[i].done = true;
90e56fb4 355 comp_param[i].quit = false;
56e93d26
JQ
356 qemu_mutex_init(&comp_param[i].mutex);
357 qemu_cond_init(&comp_param[i].cond);
358 qemu_thread_create(compress_threads + i, "compress",
359 do_data_compress, comp_param + i,
360 QEMU_THREAD_JOINABLE);
361 }
362}
363
364/**
3d0684b2 365 * save_page_header: write page header to wire
56e93d26
JQ
366 *
367 * If this is the 1st block, it also writes the block identification
368 *
3d0684b2 369 * Returns the number of bytes written
56e93d26
JQ
370 * @rs: current RAM state
371 * @f: QEMUFile where to send the data
372 * @block: block that contains the page we want to send
373 * @offset: offset inside the block for the page
374 * in the lower bits, it contains flags
375 */
2bf3aa85
JQ
376static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
377 ram_addr_t offset)
56e93d26 378{
9f5f380b 379 size_t size, len;
56e93d26 380
24795694
JQ
381 if (block == rs->last_sent_block) {
382 offset |= RAM_SAVE_FLAG_CONTINUE;
383 }
2bf3aa85 384 qemu_put_be64(f, offset);
56e93d26
JQ
385 size = 8;
386
387 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 388 len = strlen(block->idstr);
2bf3aa85
JQ
389 qemu_put_byte(f, len);
390 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 391 size += 1 + len;
24795694 392 rs->last_sent_block = block;
56e93d26
JQ
393 }
394 return size;
395}
396
3d0684b2
JQ
397/**
398 * mig_throttle_guest_down: throttle down the guest
399 *
400 * Reduce amount of guest cpu execution to hopefully slow down memory
401 * writes. If guest dirty memory rate is reduced below the rate at
402 * which we can transfer pages to the destination then we should be
403 * able to complete migration. Some workloads dirty memory way too
404 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
405 */
406static void mig_throttle_guest_down(void)
407{
408 MigrationState *s = migrate_get_current();
2594f56d
DB
409 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
410 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
411
412 /* We have not started throttling yet. Let's start it. */
413 if (!cpu_throttle_active()) {
414 cpu_throttle_set(pct_initial);
415 } else {
416 /* Throttling already on, just increase the rate */
417 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
418 }
419}
420
3d0684b2
JQ
421/**
422 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
423 *
6f37bb8b 424 * @rs: current RAM state
3d0684b2
JQ
425 * @current_addr: address for the zero page
426 *
427 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
428 * The important thing is that a stale (not-yet-0'd) page be replaced
429 * by the new data.
430 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 431 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 432 */
6f37bb8b 433static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 434{
6f37bb8b 435 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
436 return;
437 }
438
439 /* We don't care if this fails to allocate a new cache page
440 * as long as it updated an old one */
c00e0928 441 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 442 ram_counters.dirty_sync_count);
56e93d26
JQ
443}
444
445#define ENCODING_FLAG_XBZRLE 0x1
446
447/**
448 * save_xbzrle_page: compress and send current page
449 *
450 * Returns: 1 means that we wrote the page
451 * 0 means that page is identical to the one already sent
452 * -1 means that xbzrle would be longer than normal
453 *
5a987738 454 * @rs: current RAM state
3d0684b2
JQ
455 * @current_data: pointer to the address of the page contents
456 * @current_addr: addr of the page
56e93d26
JQ
457 * @block: block that contains the page we want to send
458 * @offset: offset inside the block for the page
459 * @last_stage: if we are at the completion stage
56e93d26 460 */
204b88b8 461static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 462 ram_addr_t current_addr, RAMBlock *block,
072c2511 463 ram_addr_t offset, bool last_stage)
56e93d26
JQ
464{
465 int encoded_len = 0, bytes_xbzrle;
466 uint8_t *prev_cached_page;
467
9360447d
JQ
468 if (!cache_is_cached(XBZRLE.cache, current_addr,
469 ram_counters.dirty_sync_count)) {
470 xbzrle_counters.cache_miss++;
56e93d26
JQ
471 if (!last_stage) {
472 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 473 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
474 return -1;
475 } else {
476 /* update *current_data when the page has been
477 inserted into cache */
478 *current_data = get_cached_data(XBZRLE.cache, current_addr);
479 }
480 }
481 return -1;
482 }
483
484 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
485
486 /* save current buffer into memory */
487 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
488
489 /* XBZRLE encoding (if there is no overflow) */
490 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
491 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
492 TARGET_PAGE_SIZE);
493 if (encoded_len == 0) {
55c4446b 494 trace_save_xbzrle_page_skipping();
56e93d26
JQ
495 return 0;
496 } else if (encoded_len == -1) {
55c4446b 497 trace_save_xbzrle_page_overflow();
9360447d 498 xbzrle_counters.overflow++;
56e93d26
JQ
499 /* update data in the cache */
500 if (!last_stage) {
501 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
502 *current_data = prev_cached_page;
503 }
504 return -1;
505 }
506
507 /* update the cached copy so that it matches the data the destination now holds */
508 if (!last_stage) {
509 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
510 }
511
512 /* Send XBZRLE based compressed page */
2bf3aa85 513 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
514 offset | RAM_SAVE_FLAG_XBZRLE);
515 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
516 qemu_put_be16(rs->f, encoded_len);
517 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
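    /* header bytes counted above, plus the encoded payload, the 1-byte
     * ENCODING_FLAG_XBZRLE marker and the 2-byte encoded length */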
56e93d26 518 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
519 xbzrle_counters.pages++;
520 xbzrle_counters.bytes += bytes_xbzrle;
521 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
522
523 return 1;
524}
525
3d0684b2
JQ
526/**
527 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 528 *
3d0684b2
JQ
529 * Called with rcu_read_lock() to protect migration_bitmap
530 *
531 * Returns the byte offset within memory region of the start of a dirty page
532 *
6f37bb8b 533 * @rs: current RAM state
3d0684b2 534 * @rb: RAMBlock where to search for dirty pages
a935e30f 535 * @start: page where we start the search
f3f491fc 536 */
56e93d26 537static inline
a935e30f 538unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 539 unsigned long start)
56e93d26 540{
6b6712ef
JQ
541 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
542 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
543 unsigned long next;
544
6b6712ef
JQ
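    /* During the bulk stage every page is still dirty, so skip the bitmap
     * scan and just step to the next page. */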
545 if (rs->ram_bulk_stage && start > 0) {
546 next = start + 1;
56e93d26 547 } else {
6b6712ef 548 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
549 }
550
6b6712ef 551 return next;
56e93d26
JQ
552}
553
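/* Clear the dirty bit for @page in @rb's migration bitmap. Returns true,
 * and decrements rs->migration_dirty_pages, if the bit was previously set.
 */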
06b10688 554static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
555 RAMBlock *rb,
556 unsigned long page)
a82d593b
DDAG
557{
558 bool ret;
a82d593b 559
6b6712ef 560 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
561
562 if (ret) {
0d8ec885 563 rs->migration_dirty_pages--;
a82d593b
DDAG
564 }
565 return ret;
566}
567
15440dd5
JQ
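/* Fold newly dirtied pages of [start, start + length) in @rb into the
 * migration bitmap and add them to the per-period dirty page count.
 */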
568static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
569 ram_addr_t start, ram_addr_t length)
56e93d26 570{
0d8ec885 571 rs->migration_dirty_pages +=
6b6712ef 572 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 573 &rs->num_dirty_pages_period);
56e93d26
JQ
574}
575
3d0684b2
JQ
576/**
577 * ram_pagesize_summary: calculate all the pagesizes of a VM
578 *
579 * Returns a summary bitmap of the page sizes of all RAMBlocks
580 *
581 * For VMs with just normal pages this is equivalent to the host page
582 * size. If it's got some huge pages then it's the OR of all the
583 * different page sizes.
e8ca1db2
DDAG
584 */
585uint64_t ram_pagesize_summary(void)
586{
587 RAMBlock *block;
588 uint64_t summary = 0;
589
99e15582 590 RAMBLOCK_FOREACH(block) {
e8ca1db2
DDAG
591 summary |= block->page_size;
592 }
593
594 return summary;
595}
596
8d820d6f 597static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
598{
599 RAMBlock *block;
56e93d26 600 int64_t end_time;
c4bdf0cf 601 uint64_t bytes_xfer_now;
56e93d26 602
9360447d 603 ram_counters.dirty_sync_count++;
56e93d26 604
f664da80
JQ
605 if (!rs->time_last_bitmap_sync) {
606 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
607 }
608
609 trace_migration_bitmap_sync_start();
9c1f8f44 610 memory_global_dirty_log_sync();
56e93d26 611
108cfae0 612 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 613 rcu_read_lock();
99e15582 614 RAMBLOCK_FOREACH(block) {
15440dd5 615 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
616 }
617 rcu_read_unlock();
108cfae0 618 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 619
a66cd90c 620 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 621
56e93d26
JQ
622 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
623
624 /* more than 1 second = 1000 milliseconds */
f664da80 625 if (end_time > rs->time_last_bitmap_sync + 1000) {
d693c6f1 626 /* calculate period counters */
9360447d 627 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
d693c6f1 628 / (end_time - rs->time_last_bitmap_sync);
9360447d 629 bytes_xfer_now = ram_counters.transferred;
d693c6f1 630
56e93d26
JQ
631 if (migrate_auto_converge()) {
632 /* The following detection logic can be refined later. For now:
633 Check to see if the dirtied bytes are 50% more than the approx.
634 amount of bytes that just got transferred since the last time we
070afca2
JH
635 were in this routine. If that happens twice, start or increase
636 throttling */
070afca2 637
d693c6f1 638 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 639 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 640 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 641 trace_migration_throttle();
8d820d6f 642 rs->dirty_rate_high_cnt = 0;
070afca2 643 mig_throttle_guest_down();
d693c6f1 644 }
56e93d26 645 }
070afca2 646
56e93d26 647 if (migrate_use_xbzrle()) {
23b28c3c 648 if (rs->iterations_prev != rs->iterations) {
9360447d
JQ
649 xbzrle_counters.cache_miss_rate =
650 (double)(xbzrle_counters.cache_miss -
b5833fde 651 rs->xbzrle_cache_miss_prev) /
23b28c3c 652 (rs->iterations - rs->iterations_prev);
56e93d26 653 }
23b28c3c 654 rs->iterations_prev = rs->iterations;
9360447d 655 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
56e93d26 656 }
d693c6f1
FF
657
658 /* reset period counters */
f664da80 659 rs->time_last_bitmap_sync = end_time;
a66cd90c 660 rs->num_dirty_pages_period = 0;
d2a4d85a 661 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 662 }
4addcd4f 663 if (migrate_use_events()) {
9360447d 664 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
4addcd4f 665 }
56e93d26
JQ
666}
667
668/**
3d0684b2 669 * save_zero_page: send the zero page to the stream
56e93d26 670 *
3d0684b2 671 * Returns the number of pages written.
56e93d26 672 *
f7ccd61b 673 * @rs: current RAM state
56e93d26
JQ
674 * @block: block that contains the page we want to send
675 * @offset: offset inside the block for the page
676 * @p: pointer to the page
56e93d26 677 */
ce25d337
JQ
678static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
679 uint8_t *p)
56e93d26
JQ
680{
681 int pages = -1;
682
683 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
9360447d
JQ
684 ram_counters.duplicate++;
685 ram_counters.transferred +=
bb890ed5 686 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
ce25d337 687 qemu_put_byte(rs->f, 0);
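        /* the single zero byte written above as the page "payload" */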
9360447d 688 ram_counters.transferred += 1;
56e93d26
JQ
689 pages = 1;
690 }
691
692 return pages;
693}
694
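/* With the release-ram capability set and only while in postcopy, discard
 * the just-sent pages from the source RAMBlock so the source's memory
 * footprint can shrink as the migration progresses.
 */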
5727309d 695static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 696{
5727309d 697 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
698 return;
699 }
700
aaa2064c 701 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
702}
703
56e93d26 704/**
3d0684b2 705 * ram_save_page: send the given page to the stream
56e93d26 706 *
3d0684b2 707 * Returns the number of pages written.
3fd3c4b3
DDAG
708 * < 0 - error
709 * >=0 - Number of pages written - this might legally be 0
710 * if xbzrle noticed the page was the same.
56e93d26 711 *
6f37bb8b 712 * @rs: current RAM state
56e93d26
JQ
713 * @pss: data about the page we want to send (the block and the
714 * offset of the page inside that block)
715 * @last_stage: if we are at the completion stage
56e93d26 716 */
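/* Transmission order tried here: a transport-specific hook via
 * ram_control_save_page(), then zero-page detection, then XBZRLE (only
 * outside the bulk stage), and finally the full page contents.
 */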
a0a8aa14 717static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
718{
719 int pages = -1;
720 uint64_t bytes_xmit;
721 ram_addr_t current_addr;
56e93d26
JQ
722 uint8_t *p;
723 int ret;
724 bool send_async = true;
a08f6890 725 RAMBlock *block = pss->block;
a935e30f 726 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 727
2f68e399 728 p = block->host + offset;
1db9d8e5 729 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26
JQ
730
731 /* When in doubt, send the page as normal */
732 bytes_xmit = 0;
ce25d337 733 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
734 offset, TARGET_PAGE_SIZE, &bytes_xmit);
735 if (bytes_xmit) {
9360447d 736 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
737 pages = 1;
738 }
739
740 XBZRLE_cache_lock();
741
742 current_addr = block->offset + offset;
743
56e93d26
JQ
744 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
745 if (ret != RAM_SAVE_CONTROL_DELAYED) {
746 if (bytes_xmit > 0) {
9360447d 747 ram_counters.normal++;
56e93d26 748 } else if (bytes_xmit == 0) {
9360447d 749 ram_counters.duplicate++;
56e93d26
JQ
750 }
751 }
752 } else {
ce25d337 753 pages = save_zero_page(rs, block, offset, p);
56e93d26
JQ
754 if (pages > 0) {
755 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
756 * page would be stale
757 */
6f37bb8b 758 xbzrle_cache_zero_page(rs, current_addr);
a935e30f 759 ram_release_pages(block->idstr, offset, pages);
6f37bb8b 760 } else if (!rs->ram_bulk_stage &&
5727309d 761 !migration_in_postcopy() && migrate_use_xbzrle()) {
204b88b8 762 pages = save_xbzrle_page(rs, &p, current_addr, block,
072c2511 763 offset, last_stage);
56e93d26
JQ
764 if (!last_stage) {
765 /* Can't send this cached data async, since the cache page
766 * might get updated before it gets to the wire
767 */
768 send_async = false;
769 }
770 }
771 }
772
773 /* XBZRLE overflow or normal page */
774 if (pages == -1) {
9360447d
JQ
775 ram_counters.transferred +=
776 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
56e93d26 777 if (send_async) {
ce25d337 778 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
53f09a10 779 migrate_release_ram() &
5727309d 780 migration_in_postcopy());
56e93d26 781 } else {
ce25d337 782 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
56e93d26 783 }
9360447d 784 ram_counters.transferred += TARGET_PAGE_SIZE;
56e93d26 785 pages = 1;
9360447d 786 ram_counters.normal++;
56e93d26
JQ
787 }
788
789 XBZRLE_cache_unlock();
790
791 return pages;
792}
793
a7a9a88f
LL
794static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
795 ram_addr_t offset)
56e93d26 796{
53518d94 797 RAMState *rs = ram_state;
56e93d26 798 int bytes_sent, blen;
a7a9a88f 799 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 800
2bf3aa85 801 bytes_sent = save_page_header(rs, f, block, offset |
56e93d26 802 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 803 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 804 migrate_compress_level());
b3be2896
LL
805 if (blen < 0) {
806 bytes_sent = 0;
807 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
808 error_report("compressed data failed!");
809 } else {
810 bytes_sent += blen;
5727309d 811 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 812 }
56e93d26
JQ
813
814 return bytes_sent;
815}
816
ce25d337 817static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
818{
819 int idx, len, thread_count;
820
821 if (!migrate_use_compression()) {
822 return;
823 }
824 thread_count = migrate_compress_threads();
a7a9a88f 825
0d9f9a5c 826 qemu_mutex_lock(&comp_done_lock);
56e93d26 827 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 828 while (!comp_param[idx].done) {
0d9f9a5c 829 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 830 }
a7a9a88f 831 }
0d9f9a5c 832 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
833
834 for (idx = 0; idx < thread_count; idx++) {
835 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 836 if (!comp_param[idx].quit) {
ce25d337 837 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
9360447d 838 ram_counters.transferred += len;
56e93d26 839 }
a7a9a88f 840 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
841 }
842}
843
844static inline void set_compress_params(CompressParam *param, RAMBlock *block,
845 ram_addr_t offset)
846{
847 param->block = block;
848 param->offset = offset;
849}
850
ce25d337
JQ
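/* Hand one page to an idle compression thread. Any output the chosen
 * thread produced for its previous page is flushed to rs->f first; if all
 * workers are busy we block on comp_done_cond until one finishes.
 */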
851static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
852 ram_addr_t offset)
56e93d26
JQ
853{
854 int idx, thread_count, bytes_xmit = -1, pages = -1;
855
856 thread_count = migrate_compress_threads();
0d9f9a5c 857 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
858 while (true) {
859 for (idx = 0; idx < thread_count; idx++) {
860 if (comp_param[idx].done) {
a7a9a88f 861 comp_param[idx].done = false;
ce25d337 862 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 863 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 864 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
865 qemu_cond_signal(&comp_param[idx].cond);
866 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 867 pages = 1;
9360447d
JQ
868 ram_counters.normal++;
869 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
870 break;
871 }
872 }
873 if (pages > 0) {
874 break;
875 } else {
0d9f9a5c 876 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
877 }
878 }
0d9f9a5c 879 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
880
881 return pages;
882}
883
884/**
885 * ram_save_compressed_page: compress the given page and send it to the stream
886 *
3d0684b2 887 * Returns the number of pages written.
56e93d26 888 *
6f37bb8b 889 * @rs: current RAM state
56e93d26
JQ
890 * @pss: data about the page we want to send (the block and the
891 * offset of the page inside that block)
892 * @last_stage: if we are at the completion stage
56e93d26 893 */
a0a8aa14
JQ
894static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
895 bool last_stage)
56e93d26
JQ
896{
897 int pages = -1;
fc50438e 898 uint64_t bytes_xmit = 0;
56e93d26 899 uint8_t *p;
fc50438e 900 int ret, blen;
a08f6890 901 RAMBlock *block = pss->block;
a935e30f 902 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 903
2f68e399 904 p = block->host + offset;
56e93d26 905
ce25d337 906 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
907 offset, TARGET_PAGE_SIZE, &bytes_xmit);
908 if (bytes_xmit) {
9360447d 909 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
910 pages = 1;
911 }
56e93d26
JQ
912 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
913 if (ret != RAM_SAVE_CONTROL_DELAYED) {
914 if (bytes_xmit > 0) {
9360447d 915 ram_counters.normal++;
56e93d26 916 } else if (bytes_xmit == 0) {
9360447d 917 ram_counters.duplicate++;
56e93d26
JQ
918 }
919 }
920 } else {
921 /* When starting the process of a new block, the first page of
922 * the block should be sent out before other pages in the same
923 * block, and all the pages in last block should have been sent
924 * out, keeping this order is important, because the 'cont' flag
925 * is used to avoid resending the block name.
926 */
6f37bb8b 927 if (block != rs->last_sent_block) {
ce25d337
JQ
928 flush_compressed_data(rs);
929 pages = save_zero_page(rs, block, offset, p);
56e93d26 930 if (pages == -1) {
fc50438e 931 /* Make sure the first page is sent out before other pages */
2bf3aa85 932 bytes_xmit = save_page_header(rs, rs->f, block, offset |
fc50438e 933 RAM_SAVE_FLAG_COMPRESS_PAGE);
ce25d337 934 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
fc50438e
LL
935 migrate_compress_level());
936 if (blen > 0) {
9360447d
JQ
937 ram_counters.transferred += bytes_xmit + blen;
938 ram_counters.normal++;
b3be2896 939 pages = 1;
fc50438e 940 } else {
ce25d337 941 qemu_file_set_error(rs->f, blen);
fc50438e 942 error_report("compressed data failed!");
b3be2896 943 }
56e93d26 944 }
53f09a10 945 if (pages > 0) {
a935e30f 946 ram_release_pages(block->idstr, offset, pages);
53f09a10 947 }
56e93d26 948 } else {
ce25d337 949 pages = save_zero_page(rs, block, offset, p);
56e93d26 950 if (pages == -1) {
ce25d337 951 pages = compress_page_with_multi_thread(rs, block, offset);
53f09a10 952 } else {
a935e30f 953 ram_release_pages(block->idstr, offset, pages);
56e93d26
JQ
954 }
955 }
956 }
957
958 return pages;
959}
960
3d0684b2
JQ
961/**
962 * find_dirty_block: find the next dirty page and update any state
963 * associated with the search process.
b9e60928 964 *
3d0684b2 965 * Returns if a page is found
b9e60928 966 *
6f37bb8b 967 * @rs: current RAM state
3d0684b2
JQ
968 * @pss: data about the state of the current dirty page scan
969 * @again: set to false if the search has scanned the whole of RAM
b9e60928 970 */
f20e2865 971static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 972{
f20e2865 973 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 974 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 975 pss->page >= rs->last_page) {
b9e60928
DDAG
976 /*
977 * We've been once around the RAM and haven't found anything.
978 * Give up.
979 */
980 *again = false;
981 return false;
982 }
a935e30f 983 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 984 /* Didn't find anything in this RAM Block */
a935e30f 985 pss->page = 0;
b9e60928
DDAG
986 pss->block = QLIST_NEXT_RCU(pss->block, next);
987 if (!pss->block) {
988 /* Hit the end of the list */
989 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
990 /* Flag that we've looped */
991 pss->complete_round = true;
6f37bb8b 992 rs->ram_bulk_stage = false;
b9e60928
DDAG
993 if (migrate_use_xbzrle()) {
994 /* If xbzrle is on, stop using the data compression at this
995 * point. In theory, xbzrle can do better than compression.
996 */
ce25d337 997 flush_compressed_data(rs);
b9e60928
DDAG
998 }
999 }
1000 /* Didn't find anything this time, but try again on the new block */
1001 *again = true;
1002 return false;
1003 } else {
1004 /* Can go around again, but... */
1005 *again = true;
1006 /* We've found something so probably don't need to */
1007 return true;
1008 }
1009}
1010
3d0684b2
JQ
1011/**
1012 * unqueue_page: gets a page off the queue
1013 *
a82d593b 1014 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1015 *
3d0684b2
JQ
1016 * Returns the block of the page (or NULL if none available)
1017 *
ec481c6c 1018 * @rs: current RAM state
3d0684b2 1019 * @offset: used to return the offset within the RAMBlock
a82d593b 1020 */
f20e2865 1021static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1022{
1023 RAMBlock *block = NULL;
1024
ec481c6c
JQ
1025 qemu_mutex_lock(&rs->src_page_req_mutex);
1026 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1027 struct RAMSrcPageRequest *entry =
1028 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1029 block = entry->rb;
1030 *offset = entry->offset;
a82d593b
DDAG
1031
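        /* Requests covering more than one target page are consumed one page
         * at a time; the queue entry is only removed once fully served. */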
1032 if (entry->len > TARGET_PAGE_SIZE) {
1033 entry->len -= TARGET_PAGE_SIZE;
1034 entry->offset += TARGET_PAGE_SIZE;
1035 } else {
1036 memory_region_unref(block->mr);
ec481c6c 1037 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1038 g_free(entry);
1039 }
1040 }
ec481c6c 1041 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1042
1043 return block;
1044}
1045
3d0684b2
JQ
1046/**
1047 * get_queued_page: unqueue a page from the postcopy requests
1048 *
1049 * Skips pages that are already sent (!dirty)
a82d593b 1050 *
3d0684b2 1051 * Returns if a queued page is found
a82d593b 1052 *
6f37bb8b 1053 * @rs: current RAM state
3d0684b2 1054 * @pss: data about the state of the current dirty page scan
a82d593b 1055 */
f20e2865 1056static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1057{
1058 RAMBlock *block;
1059 ram_addr_t offset;
1060 bool dirty;
1061
1062 do {
f20e2865 1063 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1064 /*
1065 * We're sending this page, and since it's postcopy nothing else
1066 * will dirty it, and we must make sure it doesn't get sent again
1067 * even if this queue request was received after the background
1068 * search already sent it.
1069 */
1070 if (block) {
f20e2865
JQ
1071 unsigned long page;
1072
6b6712ef
JQ
1073 page = offset >> TARGET_PAGE_BITS;
1074 dirty = test_bit(page, block->bmap);
a82d593b 1075 if (!dirty) {
06b10688 1076 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 1077 page, test_bit(page, block->unsentmap));
a82d593b 1078 } else {
f20e2865 1079 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1080 }
1081 }
1082
1083 } while (block && !dirty);
1084
1085 if (block) {
1086 /*
1087 * As soon as we start servicing pages out of order, then we have
1088 * to kill the bulk stage, since the bulk stage assumes
1089 * (in migration_bitmap_find_dirty) that every page is
1090 * dirty, which is no longer true.
1091 */
6f37bb8b 1092 rs->ram_bulk_stage = false;
a82d593b
DDAG
1093
1094 /*
1095 * We want the background search to continue from the queued page
1096 * since the guest is likely to want other pages near to the page
1097 * it just requested.
1098 */
1099 pss->block = block;
a935e30f 1100 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
1101 }
1102
1103 return !!block;
1104}
1105
6c595cde 1106/**
5e58f968
JQ
1107 * migration_page_queue_free: drop any remaining pages in the ram
1108 * request queue
6c595cde 1109 *
3d0684b2
JQ
1110 * It should be empty at the end anyway, but in error cases there may
1111 * be some left; any pages that remain are simply dropped.
1112 *
6c595cde 1113 */
83c13382 1114static void migration_page_queue_free(RAMState *rs)
6c595cde 1115{
ec481c6c 1116 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1117 /* This queue generally should be empty - but in the case of a failed
1118 * migration it might have some entries left in it.
1119 */
1120 rcu_read_lock();
ec481c6c 1121 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1122 memory_region_unref(mspr->rb->mr);
ec481c6c 1123 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1124 g_free(mspr);
1125 }
1126 rcu_read_unlock();
1127}
1128
1129/**
3d0684b2
JQ
1130 * ram_save_queue_pages: queue the page for transmission
1131 *
1132 * A request from postcopy destination for example.
1133 *
1134 * Returns zero on success or negative on error
1135 *
3d0684b2
JQ
1136 * @rbname: Name of the RAMBlock of the request. NULL means the
1137 * same as the last one.
1138 * @start: starting address from the start of the RAMBlock
1139 * @len: length (in bytes) to send
6c595cde 1140 */
96506894 1141int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1142{
1143 RAMBlock *ramblock;
53518d94 1144 RAMState *rs = ram_state;
6c595cde 1145
9360447d 1146 ram_counters.postcopy_requests++;
6c595cde
DDAG
1147 rcu_read_lock();
1148 if (!rbname) {
1149 /* Reuse last RAMBlock */
68a098f3 1150 ramblock = rs->last_req_rb;
6c595cde
DDAG
1151
1152 if (!ramblock) {
1153 /*
1154 * Shouldn't happen, we can't reuse the last RAMBlock if
1155 * it's the 1st request.
1156 */
1157 error_report("ram_save_queue_pages no previous block");
1158 goto err;
1159 }
1160 } else {
1161 ramblock = qemu_ram_block_by_name(rbname);
1162
1163 if (!ramblock) {
1164 /* We shouldn't be asked for a non-existent RAMBlock */
1165 error_report("ram_save_queue_pages no block '%s'", rbname);
1166 goto err;
1167 }
68a098f3 1168 rs->last_req_rb = ramblock;
6c595cde
DDAG
1169 }
1170 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1171 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1172 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1173 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1174 __func__, start, len, ramblock->used_length);
1175 goto err;
1176 }
1177
ec481c6c
JQ
1178 struct RAMSrcPageRequest *new_entry =
1179 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1180 new_entry->rb = ramblock;
1181 new_entry->offset = start;
1182 new_entry->len = len;
1183
1184 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1185 qemu_mutex_lock(&rs->src_page_req_mutex);
1186 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1187 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1188 rcu_read_unlock();
1189
1190 return 0;
1191
1192err:
1193 rcu_read_unlock();
1194 return -1;
1195}
1196
a82d593b 1197/**
3d0684b2 1198 * ram_save_target_page: save one target page
a82d593b 1199 *
3d0684b2 1200 * Returns the number of pages written
a82d593b 1201 *
6f37bb8b 1202 * @rs: current RAM state
3d0684b2 1203 *
3d0684b2 1204 * @pss: data about the page we want to send
a82d593b 1205 * @last_stage: if we are at the completion stage
a82d593b 1206 */
a0a8aa14 1207static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1208 bool last_stage)
a82d593b
DDAG
1209{
1210 int res = 0;
1211
1212 /* Check whether the page is dirty and, if so, send it */
f20e2865 1213 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
6d358d94
JQ
1214 /*
1215 * If xbzrle is on, stop using the data compression after first
1216 * round of migration even if compression is enabled. In theory,
1217 * xbzrle can do better than compression.
1218 */
6b6712ef
JQ
1219 if (migrate_use_compression() &&
1220 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
a0a8aa14 1221 res = ram_save_compressed_page(rs, pss, last_stage);
a82d593b 1222 } else {
a0a8aa14 1223 res = ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1224 }
1225
1226 if (res < 0) {
1227 return res;
1228 }
6b6712ef
JQ
1229 if (pss->block->unsentmap) {
1230 clear_bit(pss->page, pss->block->unsentmap);
a82d593b
DDAG
1231 }
1232 }
1233
1234 return res;
1235}
1236
1237/**
3d0684b2 1238 * ram_save_host_page: save a whole host page
a82d593b 1239 *
3d0684b2
JQ
1240 * Starting at pss->page, send pages up to the end of the current host
1241 * page. It's valid for the initial offset to point into the middle of
1242 * a host page, in which case the remainder of the host page is sent.
1243 * Only dirty target pages are sent. Note that the host page size may
1244 * be a huge page for this block.
1eb3fc0a
DDAG
1245 * The saving stops at the boundary of the used_length of the block
1246 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1247 *
3d0684b2
JQ
1248 * Returns the number of pages written or negative on error
1249 *
6f37bb8b 1250 * @rs: current RAM state
3d0684b2 1251 *
3d0684b2 1252 * @pss: data about the page we want to send
a82d593b 1253 * @last_stage: if we are at the completion stage
a82d593b 1254 */
a0a8aa14 1255static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1256 bool last_stage)
a82d593b
DDAG
1257{
1258 int tmppages, pages = 0;
a935e30f
JQ
1259 size_t pagesize_bits =
1260 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1261
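    /* pagesize_bits is the number of target pages per host page for this
     * block; it is greater than 1 when the block's host page size exceeds
     * the target page size (e.g. hugepage-backed blocks). */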
a82d593b 1262 do {
f20e2865 1263 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1264 if (tmppages < 0) {
1265 return tmppages;
1266 }
1267
1268 pages += tmppages;
a935e30f 1269 pss->page++;
1eb3fc0a
DDAG
1270 } while ((pss->page & (pagesize_bits - 1)) &&
1271 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
1272
1273 /* The offset we leave with is the last one we looked at */
a935e30f 1274 pss->page--;
a82d593b
DDAG
1275 return pages;
1276}
6c595cde 1277
56e93d26 1278/**
3d0684b2 1279 * ram_find_and_save_block: finds a dirty page and sends it to the stream
56e93d26
JQ
1280 *
1281 * Called within an RCU critical section.
1282 *
3d0684b2 1283 * Returns the number of pages written where zero means no dirty pages
56e93d26 1284 *
6f37bb8b 1285 * @rs: current RAM state
56e93d26 1286 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1287 *
1288 * On systems where host-page-size > target-page-size it will send all the
1289 * pages in a host page that are dirty.
56e93d26
JQ
1290 */
1291
ce25d337 1292static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1293{
b8fb8cb7 1294 PageSearchStatus pss;
56e93d26 1295 int pages = 0;
b9e60928 1296 bool again, found;
56e93d26 1297
0827b9e9
AA
1298 /* No dirty page as there is zero RAM */
1299 if (!ram_bytes_total()) {
1300 return pages;
1301 }
1302
6f37bb8b 1303 pss.block = rs->last_seen_block;
a935e30f 1304 pss.page = rs->last_page;
b8fb8cb7
DDAG
1305 pss.complete_round = false;
1306
1307 if (!pss.block) {
1308 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1309 }
56e93d26 1310
b9e60928 1311 do {
a82d593b 1312 again = true;
f20e2865 1313 found = get_queued_page(rs, &pss);
b9e60928 1314
a82d593b
DDAG
1315 if (!found) {
1316 /* priority queue empty, so just search for something dirty */
f20e2865 1317 found = find_dirty_block(rs, &pss, &again);
a82d593b 1318 }
f3f491fc 1319
a82d593b 1320 if (found) {
f20e2865 1321 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1322 }
b9e60928 1323 } while (!pages && again);
56e93d26 1324
6f37bb8b 1325 rs->last_seen_block = pss.block;
a935e30f 1326 rs->last_page = pss.page;
56e93d26
JQ
1327
1328 return pages;
1329}
1330
1331void acct_update_position(QEMUFile *f, size_t size, bool zero)
1332{
1333 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1334
56e93d26 1335 if (zero) {
9360447d 1336 ram_counters.duplicate += pages;
56e93d26 1337 } else {
9360447d
JQ
1338 ram_counters.normal += pages;
1339 ram_counters.transferred += size;
56e93d26
JQ
1340 qemu_update_position(f, size);
1341 }
1342}
1343
56e93d26
JQ
1344uint64_t ram_bytes_total(void)
1345{
1346 RAMBlock *block;
1347 uint64_t total = 0;
1348
1349 rcu_read_lock();
99e15582 1350 RAMBLOCK_FOREACH(block) {
56e93d26 1351 total += block->used_length;
99e15582 1352 }
56e93d26
JQ
1353 rcu_read_unlock();
1354 return total;
1355}
1356
1357void free_xbzrle_decoded_buf(void)
1358{
1359 g_free(xbzrle_decoded_buf);
1360 xbzrle_decoded_buf = NULL;
1361}
1362
6ad2a215 1363static void ram_migration_cleanup(void *opaque)
56e93d26 1364{
53518d94 1365 RAMState **rsp = opaque;
6b6712ef 1366 RAMBlock *block;
eb859c53 1367
2ff64038
LZ
1368 /* The caller must hold the iothread lock or be in a bottom half, so there
1369 * is no write race against the migration bitmap
1370 */
6b6712ef
JQ
1371 memory_global_dirty_log_stop();
1372
1373 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1374 g_free(block->bmap);
1375 block->bmap = NULL;
1376 g_free(block->unsentmap);
1377 block->unsentmap = NULL;
56e93d26
JQ
1378 }
1379
1380 XBZRLE_cache_lock();
1381 if (XBZRLE.cache) {
1382 cache_fini(XBZRLE.cache);
1383 g_free(XBZRLE.encoded_buf);
1384 g_free(XBZRLE.current_buf);
c00e0928 1385 g_free(XBZRLE.zero_target_page);
56e93d26
JQ
1386 XBZRLE.cache = NULL;
1387 XBZRLE.encoded_buf = NULL;
1388 XBZRLE.current_buf = NULL;
c00e0928 1389 XBZRLE.zero_target_page = NULL;
56e93d26
JQ
1390 }
1391 XBZRLE_cache_unlock();
53518d94
JQ
1392 migration_page_queue_free(*rsp);
1393 g_free(*rsp);
1394 *rsp = NULL;
56e93d26
JQ
1395}
1396
6f37bb8b 1397static void ram_state_reset(RAMState *rs)
56e93d26 1398{
6f37bb8b
JQ
1399 rs->last_seen_block = NULL;
1400 rs->last_sent_block = NULL;
269ace29 1401 rs->last_page = 0;
6f37bb8b
JQ
1402 rs->last_version = ram_list.version;
1403 rs->ram_bulk_stage = true;
56e93d26
JQ
1404}
1405
1406#define MAX_WAIT 50 /* ms, half buffered_file limit */
1407
4f2e4252
DDAG
1408/*
1409 * 'expected' is the value you expect the bitmap mostly to be full
1410 * of; it won't bother printing lines that are all this value.
1411 * 'pages' is the number of bits (target pages) in the bitmap.
1412 */
6b6712ef
JQ
1413void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1414 unsigned long pages)
4f2e4252 1415{
4f2e4252
DDAG
1416 int64_t cur;
1417 int64_t linelen = 128;
1418 char linebuf[129];
1419
6b6712ef 1420 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1421 int64_t curb;
1422 bool found = false;
1423 /*
1424 * Last line; catch the case where the line length
1425 * is longer than remaining ram
1426 */
6b6712ef
JQ
1427 if (cur + linelen > pages) {
1428 linelen = pages - cur;
4f2e4252
DDAG
1429 }
1430 for (curb = 0; curb < linelen; curb++) {
1431 bool thisbit = test_bit(cur + curb, todump);
1432 linebuf[curb] = thisbit ? '1' : '.';
1433 found = found || (thisbit != expected);
1434 }
1435 if (found) {
1436 linebuf[curb] = '\0';
1437 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1438 }
1439 }
1440}
1441
e0b266f0
DDAG
1442/* **** functions for postcopy ***** */
1443
ced1c616
PB
1444void ram_postcopy_migrated_memory_release(MigrationState *ms)
1445{
1446 struct RAMBlock *block;
ced1c616 1447
99e15582 1448 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1449 unsigned long *bitmap = block->bmap;
1450 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1451 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1452
1453 while (run_start < range) {
1454 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 1455 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
1456 (run_end - run_start) << TARGET_PAGE_BITS);
1457 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1458 }
1459 }
1460}
1461
3d0684b2
JQ
1462/**
1463 * postcopy_send_discard_bm_ram: discard a RAMBlock
1464 *
1465 * Returns zero on success
1466 *
e0b266f0
DDAG
1467 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1468 * Note: At this point the 'unsentmap' is the processed bitmap combined
1469 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1470 *
1471 * @ms: current migration state
1472 * @pds: state for postcopy
1473 * @start: RAMBlock starting page
1474 * @length: RAMBlock size
e0b266f0
DDAG
1475 */
1476static int postcopy_send_discard_bm_ram(MigrationState *ms,
1477 PostcopyDiscardState *pds,
6b6712ef 1478 RAMBlock *block)
e0b266f0 1479{
6b6712ef 1480 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1481 unsigned long current;
6b6712ef 1482 unsigned long *unsentmap = block->unsentmap;
e0b266f0 1483
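    /* Walk runs of set bits in the unsentmap and emit one discard range
     * per run of pages that are dirty or were never sent. */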
6b6712ef 1484 for (current = 0; current < end; ) {
e0b266f0
DDAG
1485 unsigned long one = find_next_bit(unsentmap, end, current);
1486
1487 if (one <= end) {
1488 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1489 unsigned long discard_length;
1490
1491 if (zero >= end) {
1492 discard_length = end - one;
1493 } else {
1494 discard_length = zero - one;
1495 }
d688c62d
DDAG
1496 if (discard_length) {
1497 postcopy_discard_send_range(ms, pds, one, discard_length);
1498 }
e0b266f0
DDAG
1499 current = one + discard_length;
1500 } else {
1501 current = one;
1502 }
1503 }
1504
1505 return 0;
1506}
1507
3d0684b2
JQ
1508/**
1509 * postcopy_each_ram_send_discard: discard all RAMBlocks
1510 *
1511 * Returns 0 for success or negative for error
1512 *
e0b266f0
DDAG
1513 * Utility for the outgoing postcopy code.
1514 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1515 * passing it bitmap indexes and name.
e0b266f0
DDAG
1516 * (qemu_ram_foreach_block ends up passing unscaled lengths
1517 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1518 *
1519 * @ms: current migration state
e0b266f0
DDAG
1520 */
1521static int postcopy_each_ram_send_discard(MigrationState *ms)
1522{
1523 struct RAMBlock *block;
1524 int ret;
1525
99e15582 1526 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1527 PostcopyDiscardState *pds =
1528 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
1529
1530 /*
1531 * Postcopy sends chunks of bitmap over the wire, but it
1532 * just needs indexes at this point, which avoids it having
1533 * target-page-specific code.
1534 */
6b6712ef 1535 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
1536 postcopy_discard_send_finish(ms, pds);
1537 if (ret) {
1538 return ret;
1539 }
1540 }
1541
1542 return 0;
1543}
1544
3d0684b2
JQ
1545/**
1546 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1547 *
1548 * Helper for postcopy_chunk_hostpages; it's called twice to
1549 * canonicalize the two bitmaps, that are similar, but one is
1550 * inverted.
99e314eb 1551 *
3d0684b2
JQ
1552 * Postcopy requires that all target pages in a hostpage are dirty or
1553 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1554 *
3d0684b2
JQ
1555 * @ms: current migration state
1556 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1557 * otherwise we need to canonicalize partially dirty host pages
1558 * @block: block that contains the page we want to canonicalize
1559 * @pds: state for postcopy
99e314eb
DDAG
1560 */
1561static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1562 RAMBlock *block,
1563 PostcopyDiscardState *pds)
1564{
53518d94 1565 RAMState *rs = ram_state;
6b6712ef
JQ
1566 unsigned long *bitmap = block->bmap;
1567 unsigned long *unsentmap = block->unsentmap;
29c59172 1568 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 1569 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
1570 unsigned long run_start;
1571
29c59172
DDAG
1572 if (block->page_size == TARGET_PAGE_SIZE) {
1573 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1574 return;
1575 }
1576
99e314eb
DDAG
1577 if (unsent_pass) {
1578 /* Find a sent page */
6b6712ef 1579 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
1580 } else {
1581 /* Find a dirty page */
6b6712ef 1582 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
1583 }
1584
6b6712ef 1585 while (run_start < pages) {
99e314eb
DDAG
1586 bool do_fixup = false;
1587 unsigned long fixup_start_addr;
1588 unsigned long host_offset;
1589
1590 /*
1591 * If the start of this run of pages is in the middle of a host
1592 * page, then we need to fixup this host page.
1593 */
1594 host_offset = run_start % host_ratio;
1595 if (host_offset) {
1596 do_fixup = true;
1597 run_start -= host_offset;
1598 fixup_start_addr = run_start;
1599 /* For the next pass */
1600 run_start = run_start + host_ratio;
1601 } else {
1602 /* Find the end of this run */
1603 unsigned long run_end;
1604 if (unsent_pass) {
6b6712ef 1605 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 1606 } else {
6b6712ef 1607 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
1608 }
1609 /*
1610 * If the end isn't at the start of a host page, then the
1611 * run doesn't finish at the end of a host page
1612 * and we need to discard.
1613 */
1614 host_offset = run_end % host_ratio;
1615 if (host_offset) {
1616 do_fixup = true;
1617 fixup_start_addr = run_end - host_offset;
1618 /*
1619 * This host page has gone, the next loop iteration starts
1620 * from after the fixup
1621 */
1622 run_start = fixup_start_addr + host_ratio;
1623 } else {
1624 /*
1625 * No discards on this iteration, next loop starts from
1626 * next sent/dirty page
1627 */
1628 run_start = run_end + 1;
1629 }
1630 }
1631
1632 if (do_fixup) {
1633 unsigned long page;
1634
1635 /* Tell the destination to discard this page */
1636 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1637 /* For the unsent_pass we:
1638 * discard partially sent pages
1639 * For the !unsent_pass (dirty) we:
1640 * discard partially dirty pages that were sent
1641 * (any partially sent pages were already discarded
1642 * by the previous unsent_pass)
1643 */
1644 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1645 host_ratio);
1646 }
1647
1648 /* Clean up the bitmap */
1649 for (page = fixup_start_addr;
1650 page < fixup_start_addr + host_ratio; page++) {
1651 /* All pages in this host page are now not sent */
1652 set_bit(page, unsentmap);
1653
1654 /*
1655 * Remark them as dirty, updating the count for any pages
1656 * that weren't previously dirty.
1657 */
0d8ec885 1658 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1659 }
1660 }
1661
1662 if (unsent_pass) {
1663 /* Find the next sent page for the next iteration */
6b6712ef 1664 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
1665 } else {
1666 /* Find the next dirty page for the next iteration */
6b6712ef 1667 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
1668 }
1669 }
1670}
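/*
 * Illustrative sketch, not part of ram.c: the host-page alignment
 * arithmetic used by the pass above, shown standalone.  It assumes a
 * 2 MiB host page and a 4 KiB target page (host_ratio = 512); the
 * helper name and the numbers are invented for the example.
 */
#include <stdio.h>

static unsigned long host_page_start(unsigned long run_start,
                                     unsigned long host_ratio)
{
    /* Round a target-page index down to its host-page boundary */
    return run_start - (run_start % host_ratio);
}

int main(void)
{
    unsigned long host_ratio = 512;    /* 2 MiB / 4 KiB */
    unsigned long run_start = 1000;    /* run begins mid host page */

    /* Prints 512: target pages 512..1023 form the host page to fix up */
    printf("%lu\n", host_page_start(run_start, host_ratio));
    return 0;
}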
1671
3d0684b2
JQ
1672/**
1673 * postcopy_chunk_hostpages: discard any partially sent host page
1674 *
99e314eb
DDAG
1675 * Utility for the outgoing postcopy code.
1676 *
1677 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1678 * dirty host-page size chunks as all dirty. In this case the host page
1679 * is that of the particular RAMBlock, i.e. it might be a huge page
99e314eb 1680 *
3d0684b2
JQ
1681 * Returns zero on success
1682 *
1683 * @ms: current migration state
6b6712ef 1684 * @block: block we want to work with
99e314eb 1685 */
6b6712ef 1686static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 1687{
6b6712ef
JQ
1688 PostcopyDiscardState *pds =
1689 postcopy_discard_send_init(ms, block->idstr);
99e314eb 1690
6b6712ef
JQ
1691 /* First pass: Discard all partially sent host pages */
1692 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1693 /*
1694 * Second pass: Ensure that all partially dirty host pages are made
1695 * fully dirty.
1696 */
1697 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 1698
6b6712ef 1699 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
1700 return 0;
1701}
1702
3d0684b2
JQ
1703/**
1704 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1705 *
1706 * Returns zero on success
1707 *
e0b266f0
DDAG
1708 * Transmit the set of pages to be discarded after precopy to the target;
1709 * these are pages that:
1710 * a) Have been previously transmitted but are now dirty again
1711 * b) Have never been transmitted; this ensures that any pages on the
1712 * destination that have been mapped by background tasks get discarded
1713 * (transparent huge pages are the specific concern)
1714 * Hopefully this is pretty sparse
3d0684b2
JQ
1715 *
1716 * @ms: current migration state
e0b266f0
DDAG
1717 */
1718int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1719{
53518d94 1720 RAMState *rs = ram_state;
6b6712ef 1721 RAMBlock *block;
e0b266f0 1722 int ret;
e0b266f0
DDAG
1723
1724 rcu_read_lock();
1725
1726 /* This should be our last sync, the src is now paused */
eb859c53 1727 migration_bitmap_sync(rs);
e0b266f0 1728
6b6712ef
JQ
1729 /* Easiest way to make sure we don't resume in the middle of a host-page */
1730 rs->last_seen_block = NULL;
1731 rs->last_sent_block = NULL;
1732 rs->last_page = 0;
e0b266f0 1733
6b6712ef
JQ
1734 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1735 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1736 unsigned long *bitmap = block->bmap;
1737 unsigned long *unsentmap = block->unsentmap;
1738
1739 if (!unsentmap) {
1740 /* We don't have a safe way to resize the unsentmap, so
1741 * if the bitmap was resized it will be NULL at this
1742 * point.
1743 */
1744 error_report("migration ram resized during precopy phase");
1745 rcu_read_unlock();
1746 return -EINVAL;
1747 }
1748 /* Deal with TPS != HPS and huge pages */
1749 ret = postcopy_chunk_hostpages(ms, block);
1750 if (ret) {
1751 rcu_read_unlock();
1752 return ret;
1753 }
e0b266f0 1754
6b6712ef
JQ
1755 /*
1756 * Update the unsentmap to be unsentmap = unsentmap | dirty
1757 */
1758 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 1759#ifdef DEBUG_POSTCOPY
6b6712ef 1760 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 1761#endif
6b6712ef
JQ
1762 }
1763 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
1764
1765 ret = postcopy_each_ram_send_discard(ms);
1766 rcu_read_unlock();
1767
1768 return ret;
1769}
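/*
 * Illustrative sketch, not part of ram.c: the union computed by
 * bitmap_or() above, reduced to a single machine word.  A page must be
 * discarded on the destination if it was never sent or if it has been
 * dirtied since it was sent; the bit patterns are invented.
 */
#include <stdio.h>

int main(void)
{
    unsigned long unsent = 0xF0;    /* pages 4-7 were never sent    */
    unsigned long dirty  = 0x0C;    /* pages 2-3 dirtied after send */
    unsigned long discard = unsent | dirty;

    /* Prints fc: pages 2-7 will be discarded and re-sent via postcopy */
    printf("%lx\n", discard);
    return 0;
}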
1770
3d0684b2
JQ
1771/**
1772 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1773 *
3d0684b2 1774 * Returns zero on success
e0b266f0 1775 *
36449157
JQ
1776 * @rbname: name of the RAMBlock of the request. NULL means the
1777 * same as the last one.
3d0684b2
JQ
1778 * @start: RAMBlock starting page
1779 * @length: RAMBlock size
e0b266f0 1780 */
aaa2064c 1781int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
1782{
1783 int ret = -1;
1784
36449157 1785 trace_ram_discard_range(rbname, start, length);
d3a5038c 1786
e0b266f0 1787 rcu_read_lock();
36449157 1788 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1789
1790 if (!rb) {
36449157 1791 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1792 goto err;
1793 }
1794
d3a5038c 1795 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1796
1797err:
1798 rcu_read_unlock();
1799
1800 return ret;
1801}
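/*
 * Illustrative sketch, not part of ram.c: a hypothetical caller of
 * ram_discard_range().  The function name, the block name "pc.ram" and
 * the range are invented for the example; the helper simply forwards
 * to ram_block_discard_range() for the named RAMBlock.
 */
static void example_discard_first_pages(void)
{
    /* Drop the first 16 target pages of a hypothetical "pc.ram" block */
    if (ram_discard_range("pc.ram", 0, 16 * TARGET_PAGE_SIZE) < 0) {
        error_report("example: discard failed");
    }
}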
1802
53518d94 1803static int ram_state_init(RAMState **rsp)
56e93d26 1804{
53518d94
JQ
1805 *rsp = g_new0(RAMState, 1);
1806
1807 qemu_mutex_init(&(*rsp)->bitmap_mutex);
1808 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1809 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26
JQ
1810
1811 if (migrate_use_xbzrle()) {
1812 XBZRLE_cache_lock();
c00e0928 1813 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1814 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1815 TARGET_PAGE_SIZE,
1816 TARGET_PAGE_SIZE);
1817 if (!XBZRLE.cache) {
1818 XBZRLE_cache_unlock();
1819 error_report("Error creating cache");
53518d94
JQ
1820 g_free(*rsp);
1821 *rsp = NULL;
56e93d26
JQ
1822 return -1;
1823 }
1824 XBZRLE_cache_unlock();
1825
1826 /* We prefer not to abort if there is no memory */
1827 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1828 if (!XBZRLE.encoded_buf) {
1829 error_report("Error allocating encoded_buf");
53518d94
JQ
1830 g_free(*rsp);
1831 *rsp = NULL;
56e93d26
JQ
1832 return -1;
1833 }
1834
1835 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1836 if (!XBZRLE.current_buf) {
1837 error_report("Error allocating current_buf");
1838 g_free(XBZRLE.encoded_buf);
1839 XBZRLE.encoded_buf = NULL;
53518d94
JQ
1840 g_free(*rsp);
1841 *rsp = NULL;
56e93d26
JQ
1842 return -1;
1843 }
56e93d26
JQ
1844 }
1845
49877834
PB
1846 /* For memory_global_dirty_log_start below. */
1847 qemu_mutex_lock_iothread();
1848
56e93d26
JQ
1849 qemu_mutex_lock_ramlist();
1850 rcu_read_lock();
53518d94 1851 ram_state_reset(*rsp);
56e93d26 1852
0827b9e9
AA
1853 /* Skip setting bitmap if there is no RAM */
1854 if (ram_bytes_total()) {
6b6712ef
JQ
1855 RAMBlock *block;
1856
1857 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1858 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
0827b9e9 1859
6b6712ef
JQ
1860 block->bmap = bitmap_new(pages);
1861 bitmap_set(block->bmap, 0, pages);
1862 if (migrate_postcopy_ram()) {
1863 block->unsentmap = bitmap_new(pages);
1864 bitmap_set(block->unsentmap, 0, pages);
1865 }
0827b9e9 1866 }
f3f491fc
DDAG
1867 }
1868
56e93d26
JQ
1869 /*
1870 * Count the total number of pages used by ram blocks not including any
1871 * gaps due to alignment or unplugs.
1872 */
53518d94 1873 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
1874
1875 memory_global_dirty_log_start();
53518d94 1876 migration_bitmap_sync(*rsp);
56e93d26 1877 qemu_mutex_unlock_ramlist();
49877834 1878 qemu_mutex_unlock_iothread();
a91246c9
HZ
1879 rcu_read_unlock();
1880
1881 return 0;
1882}
1883
3d0684b2
JQ
1884/*
1885 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
1886 * a long-running RCU critical section. When RCU reclaims in the code
1887 * start to become numerous it will be necessary to reduce the
1888 * granularity of these critical sections.
1889 */
1890
3d0684b2
JQ
1891/**
1892 * ram_save_setup: Setup RAM for migration
1893 *
1894 * Returns zero to indicate success and negative for error
1895 *
1896 * @f: QEMUFile where to send the data
1897 * @opaque: RAMState pointer
1898 */
a91246c9
HZ
1899static int ram_save_setup(QEMUFile *f, void *opaque)
1900{
53518d94 1901 RAMState **rsp = opaque;
a91246c9
HZ
1902 RAMBlock *block;
1903
1904 /* migration has already set up the bitmap, reuse it. */
1905 if (!migration_in_colo_state()) {
53518d94 1906 if (ram_state_init(rsp) != 0) {
a91246c9 1907 return -1;
53518d94 1908 }
a91246c9 1909 }
53518d94 1910 (*rsp)->f = f;
a91246c9
HZ
1911
1912 rcu_read_lock();
56e93d26
JQ
1913
1914 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1915
99e15582 1916 RAMBLOCK_FOREACH(block) {
56e93d26
JQ
1917 qemu_put_byte(f, strlen(block->idstr));
1918 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1919 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
1920 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1921 qemu_put_be64(f, block->page_size);
1922 }
56e93d26
JQ
1923 }
1924
1925 rcu_read_unlock();
1926
1927 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1928 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1929
1930 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1931
1932 return 0;
1933}
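/*
 * Illustrative sketch, not part of ram.c: the setup section written
 * above, emitted by hand for a single made-up block so the byte layout
 * is visible.  The block name, sizes and file name are invented, and
 * trivial big-endian helpers stand in for qemu_put_be64()/qemu_put_byte();
 * the flag values mirror RAM_SAVE_FLAG_MEM_SIZE and RAM_SAVE_FLAG_EOS.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EX_FLAG_MEM_SIZE 0x04
#define EX_FLAG_EOS      0x10

static void put_be64(FILE *f, uint64_t v)
{
    for (int shift = 56; shift >= 0; shift -= 8) {
        fputc((int)((v >> shift) & 0xff), f);
    }
}

int main(void)
{
    const char *idstr = "example.ram";           /* hypothetical block */
    uint64_t used_length = 128ULL << 20;         /* 128 MiB            */
    FILE *f = fopen("ram-setup.bin", "wb");

    if (!f) {
        return 1;
    }
    put_be64(f, used_length | EX_FLAG_MEM_SIZE); /* total RAM + flag   */
    fputc((int)strlen(idstr), f);                /* idstr length byte  */
    fwrite(idstr, 1, strlen(idstr), f);          /* idstr              */
    put_be64(f, used_length);                    /* block used_length  */
    put_be64(f, EX_FLAG_EOS);                    /* end of section     */
    fclose(f);
    return 0;
}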
1934
3d0684b2
JQ
1935/**
1936 * ram_save_iterate: iterative stage for migration
1937 *
1938 * Returns zero to indicate success and negative for error
1939 *
1940 * @f: QEMUFile where to send the data
1941 * @opaque: RAMState pointer
1942 */
56e93d26
JQ
1943static int ram_save_iterate(QEMUFile *f, void *opaque)
1944{
53518d94
JQ
1945 RAMState **temp = opaque;
1946 RAMState *rs = *temp;
56e93d26
JQ
1947 int ret;
1948 int i;
1949 int64_t t0;
5c90308f 1950 int done = 0;
56e93d26
JQ
1951
1952 rcu_read_lock();
6f37bb8b
JQ
1953 if (ram_list.version != rs->last_version) {
1954 ram_state_reset(rs);
56e93d26
JQ
1955 }
1956
1957 /* Read version before ram_list.blocks */
1958 smp_rmb();
1959
1960 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1961
1962 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1963 i = 0;
1964 while ((ret = qemu_file_rate_limit(f)) == 0) {
1965 int pages;
1966
ce25d337 1967 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
1968 /* no more pages to send */
1969 if (pages == 0) {
5c90308f 1970 done = 1;
56e93d26
JQ
1971 break;
1972 }
23b28c3c 1973 rs->iterations++;
070afca2 1974
56e93d26
JQ
1975 /* we want to check in the 1st loop, just in case it was the 1st time
1976 and we had to sync the dirty bitmap.
1977 qemu_clock_get_ns() is a bit expensive, so we only check every
1978 few iterations
1979 */
1980 if ((i & 63) == 0) {
1981 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1982 if (t1 > MAX_WAIT) {
55c4446b 1983 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
1984 break;
1985 }
1986 }
1987 i++;
1988 }
ce25d337 1989 flush_compressed_data(rs);
56e93d26
JQ
1990 rcu_read_unlock();
1991
1992 /*
1993 * Must occur before EOS (or any QEMUFile operation)
1994 * because of RDMA protocol.
1995 */
1996 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1997
1998 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
9360447d 1999 ram_counters.transferred += 8;
56e93d26
JQ
2000
2001 ret = qemu_file_get_error(f);
2002 if (ret < 0) {
2003 return ret;
2004 }
2005
5c90308f 2006 return done;
56e93d26
JQ
2007}
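/*
 * Illustrative sketch, not part of ram.c: the time-check pattern used
 * in the loop above.  The clock is read only every 64 iterations so
 * that a relatively expensive time source does not dominate a tight
 * loop; clock_gettime() stands in for qemu_clock_get_ns() and all
 * names are invented.
 */
#include <stdint.h>
#include <time.h>

static uint64_t now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static void bounded_work_loop(uint64_t budget_ms)
{
    uint64_t t0 = now_ms();

    for (unsigned i = 0; ; i++) {
        /* ... do one unit of work (e.g. send one page) here ... */
        if ((i & 63) == 0 && now_ms() - t0 > budget_ms) {
            break;    /* spent too long in one round, yield back */
        }
    }
}

int main(void)
{
    bounded_work_loop(1);    /* spin for about a millisecond */
    return 0;
}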
2008
3d0684b2
JQ
2009/**
2010 * ram_save_complete: function called to send the remaining amount of ram
2011 *
2012 * Returns zero to indicate success
2013 *
2014 * Called with iothread lock
2015 *
2016 * @f: QEMUFile where to send the data
2017 * @opaque: RAMState pointer
2018 */
56e93d26
JQ
2019static int ram_save_complete(QEMUFile *f, void *opaque)
2020{
53518d94
JQ
2021 RAMState **temp = opaque;
2022 RAMState *rs = *temp;
6f37bb8b 2023
56e93d26
JQ
2024 rcu_read_lock();
2025
5727309d 2026 if (!migration_in_postcopy()) {
8d820d6f 2027 migration_bitmap_sync(rs);
663e6c1d 2028 }
56e93d26
JQ
2029
2030 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2031
2032 /* try transferring iterative blocks of memory */
2033
2034 /* flush all remaining blocks regardless of rate limiting */
2035 while (true) {
2036 int pages;
2037
ce25d337 2038 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
2039 /* no more blocks to send */
2040 if (pages == 0) {
2041 break;
2042 }
2043 }
2044
ce25d337 2045 flush_compressed_data(rs);
56e93d26 2046 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2047
2048 rcu_read_unlock();
d09a6fde 2049
56e93d26
JQ
2050 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2051
2052 return 0;
2053}
2054
c31b098f
DDAG
2055static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2056 uint64_t *non_postcopiable_pending,
2057 uint64_t *postcopiable_pending)
56e93d26 2058{
53518d94
JQ
2059 RAMState **temp = opaque;
2060 RAMState *rs = *temp;
56e93d26
JQ
2061 uint64_t remaining_size;
2062
9edabd4d 2063 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2064
5727309d 2065 if (!migration_in_postcopy() &&
663e6c1d 2066 remaining_size < max_size) {
56e93d26
JQ
2067 qemu_mutex_lock_iothread();
2068 rcu_read_lock();
8d820d6f 2069 migration_bitmap_sync(rs);
56e93d26
JQ
2070 rcu_read_unlock();
2071 qemu_mutex_unlock_iothread();
9edabd4d 2072 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2073 }
c31b098f
DDAG
2074
2075 /* We can do postcopy, and all the data is postcopiable */
2076 *postcopiable_pending += remaining_size;
56e93d26
JQ
2077}
2078
2079static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2080{
2081 unsigned int xh_len;
2082 int xh_flags;
063e760a 2083 uint8_t *loaded_data;
56e93d26
JQ
2084
2085 if (!xbzrle_decoded_buf) {
2086 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2087 }
063e760a 2088 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2089
2090 /* extract RLE header */
2091 xh_flags = qemu_get_byte(f);
2092 xh_len = qemu_get_be16(f);
2093
2094 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2095 error_report("Failed to load XBZRLE page - wrong compression!");
2096 return -1;
2097 }
2098
2099 if (xh_len > TARGET_PAGE_SIZE) {
2100 error_report("Failed to load XBZRLE page - len overflow!");
2101 return -1;
2102 }
2103 /* load data and decode */
063e760a 2104 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2105
2106 /* decode RLE */
063e760a 2107 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2108 TARGET_PAGE_SIZE) == -1) {
2109 error_report("Failed to load XBZRLE page - decode error!");
2110 return -1;
2111 }
2112
2113 return 0;
2114}
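/*
 * Illustrative sketch, not part of ram.c: the on-the-wire record that
 * load_xbzrle() parses above (one flag byte, a big-endian 16-bit
 * length, then the encoded data), validated from a raw buffer.  The
 * constants are stand-ins for ENCODING_FLAG_XBZRLE and TARGET_PAGE_SIZE
 * and all names are invented.
 */
#include <stddef.h>
#include <stdint.h>

#define EX_FLAG_XBZRLE 0x01
#define EX_PAGE_SIZE   4096

/* Return the encoded-data length, or -1 if the 3-byte header is bad */
static int xbzrle_header_len(const uint8_t *buf, size_t buflen)
{
    int len;

    if (buflen < 3 || buf[0] != EX_FLAG_XBZRLE) {
        return -1;                        /* wrong or missing flag    */
    }
    len = (buf[1] << 8) | buf[2];         /* big-endian 16-bit length */
    if (len > EX_PAGE_SIZE) {
        return -1;                        /* would overflow one page  */
    }
    return len;
}

int main(void)
{
    const uint8_t hdr[] = { EX_FLAG_XBZRLE, 0x00, 0x20 };    /* len 32 */

    return xbzrle_header_len(hdr, sizeof(hdr)) == 32 ? 0 : 1;
}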
2115
3d0684b2
JQ
2116/**
2117 * ram_block_from_stream: read a RAMBlock id from the migration stream
2118 *
2119 * Must be called from within an RCU critical section.
2120 *
56e93d26 2121 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2122 *
3d0684b2
JQ
2123 * @f: QEMUFile where to read the data from
2124 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2125 */
3d0684b2 2126static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2127{
2128 static RAMBlock *block = NULL;
2129 char id[256];
2130 uint8_t len;
2131
2132 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2133 if (!block) {
56e93d26
JQ
2134 error_report("Ack, bad migration stream!");
2135 return NULL;
2136 }
4c4bad48 2137 return block;
56e93d26
JQ
2138 }
2139
2140 len = qemu_get_byte(f);
2141 qemu_get_buffer(f, (uint8_t *)id, len);
2142 id[len] = 0;
2143
e3dd7493 2144 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2145 if (!block) {
2146 error_report("Can't find block %s", id);
2147 return NULL;
56e93d26
JQ
2148 }
2149
4c4bad48
HZ
2150 return block;
2151}
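/*
 * Illustrative sketch, not part of ram.c: the sender-side counterpart
 * of the RAM_SAVE_FLAG_CONTINUE handling above, reduced to its core.
 * The function name and the last_sent parameter are invented; the real
 * sender lives in the save path of this file.
 */
static uint64_t example_page_flags(RAMBlock *block, RAMBlock **last_sent)
{
    uint64_t flags = 0;

    if (block == *last_sent) {
        /* Same block as the previous page: the idstr can be omitted */
        flags |= RAM_SAVE_FLAG_CONTINUE;
    }
    *last_sent = block;
    return flags;
}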
2152
2153static inline void *host_from_ram_block_offset(RAMBlock *block,
2154 ram_addr_t offset)
2155{
2156 if (!offset_in_ramblock(block, offset)) {
2157 return NULL;
2158 }
2159
2160 return block->host + offset;
56e93d26
JQ
2161}
2162
3d0684b2
JQ
2163/**
2164 * ram_handle_compressed: handle the zero page case
2165 *
56e93d26
JQ
2166 * If a page (or a whole RDMA chunk) has been
2167 * determined to be zero, then zap it.
3d0684b2
JQ
2168 *
2169 * @host: host address for the zero page
2170 * @ch: the value the page is filled with. We only support zero
2171 * @size: size of the zero page
56e93d26
JQ
2172 */
2173void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2174{
2175 if (ch != 0 || !is_zero_range(host, size)) {
2176 memset(host, ch, size);
2177 }
2178}
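/*
 * Illustrative sketch, not part of ram.c: the same "only write when
 * needed" idea as ram_handle_compressed(), using plain libc so it can
 * be compiled on its own.  Skipping the memset for pages that are
 * already zero avoids touching (and therefore allocating) untouched
 * destination memory; all names here are invented.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool ex_is_zero(const uint8_t *p, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

static void ex_fill_page(uint8_t *host, uint8_t ch, size_t size)
{
    /* Skip the write entirely when the page already holds the value */
    if (ch != 0 || !ex_is_zero(host, size)) {
        memset(host, ch, size);
    }
}

int main(void)
{
    static uint8_t page[4096];

    ex_fill_page(page, 0, sizeof(page));    /* no memset happens     */
    ex_fill_page(page, 7, sizeof(page));    /* page becomes all 0x07 */
    return page[123] == 7 ? 0 : 1;
}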
2179
2180static void *do_data_decompress(void *opaque)
2181{
2182 DecompressParam *param = opaque;
2183 unsigned long pagesize;
33d151f4
LL
2184 uint8_t *des;
2185 int len;
56e93d26 2186
33d151f4 2187 qemu_mutex_lock(&param->mutex);
90e56fb4 2188 while (!param->quit) {
33d151f4
LL
2189 if (param->des) {
2190 des = param->des;
2191 len = param->len;
2192 param->des = 0;
2193 qemu_mutex_unlock(&param->mutex);
2194
56e93d26 2195 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2196 /* uncompress() can fail in some cases, especially when the
2197 * page is dirtied while it is being compressed. That's not a
2198 * problem because the dirty page will be retransferred and
2199 * uncompress() won't corrupt the data in other pages.
2200 */
33d151f4
LL
2201 uncompress((Bytef *)des, &pagesize,
2202 (const Bytef *)param->compbuf, len);
73a8912b 2203
33d151f4
LL
2204 qemu_mutex_lock(&decomp_done_lock);
2205 param->done = true;
2206 qemu_cond_signal(&decomp_done_cond);
2207 qemu_mutex_unlock(&decomp_done_lock);
2208
2209 qemu_mutex_lock(&param->mutex);
2210 } else {
2211 qemu_cond_wait(&param->cond, &param->mutex);
2212 }
56e93d26 2213 }
33d151f4 2214 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2215
2216 return NULL;
2217}
2218
5533b2e9
LL
2219static void wait_for_decompress_done(void)
2220{
2221 int idx, thread_count;
2222
2223 if (!migrate_use_compression()) {
2224 return;
2225 }
2226
2227 thread_count = migrate_decompress_threads();
2228 qemu_mutex_lock(&decomp_done_lock);
2229 for (idx = 0; idx < thread_count; idx++) {
2230 while (!decomp_param[idx].done) {
2231 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2232 }
2233 }
2234 qemu_mutex_unlock(&decomp_done_lock);
2235}
2236
56e93d26
JQ
2237void migrate_decompress_threads_create(void)
2238{
2239 int i, thread_count;
2240
2241 thread_count = migrate_decompress_threads();
2242 decompress_threads = g_new0(QemuThread, thread_count);
2243 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2244 qemu_mutex_init(&decomp_done_lock);
2245 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2246 for (i = 0; i < thread_count; i++) {
2247 qemu_mutex_init(&decomp_param[i].mutex);
2248 qemu_cond_init(&decomp_param[i].cond);
2249 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2250 decomp_param[i].done = true;
90e56fb4 2251 decomp_param[i].quit = false;
56e93d26
JQ
2252 qemu_thread_create(decompress_threads + i, "decompress",
2253 do_data_decompress, decomp_param + i,
2254 QEMU_THREAD_JOINABLE);
2255 }
2256}
2257
2258void migrate_decompress_threads_join(void)
2259{
2260 int i, thread_count;
2261
56e93d26
JQ
2262 thread_count = migrate_decompress_threads();
2263 for (i = 0; i < thread_count; i++) {
2264 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2265 decomp_param[i].quit = true;
56e93d26
JQ
2266 qemu_cond_signal(&decomp_param[i].cond);
2267 qemu_mutex_unlock(&decomp_param[i].mutex);
2268 }
2269 for (i = 0; i < thread_count; i++) {
2270 qemu_thread_join(decompress_threads + i);
2271 qemu_mutex_destroy(&decomp_param[i].mutex);
2272 qemu_cond_destroy(&decomp_param[i].cond);
2273 g_free(decomp_param[i].compbuf);
2274 }
2275 g_free(decompress_threads);
2276 g_free(decomp_param);
56e93d26
JQ
2277 decompress_threads = NULL;
2278 decomp_param = NULL;
56e93d26
JQ
2279}
2280
c1bc6626 2281static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2282 void *host, int len)
2283{
2284 int idx, thread_count;
2285
2286 thread_count = migrate_decompress_threads();
73a8912b 2287 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2288 while (true) {
2289 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2290 if (decomp_param[idx].done) {
33d151f4
LL
2291 decomp_param[idx].done = false;
2292 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2293 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2294 decomp_param[idx].des = host;
2295 decomp_param[idx].len = len;
33d151f4
LL
2296 qemu_cond_signal(&decomp_param[idx].cond);
2297 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2298 break;
2299 }
2300 }
2301 if (idx < thread_count) {
2302 break;
73a8912b
LL
2303 } else {
2304 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2305 }
2306 }
73a8912b 2307 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2308}
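/*
 * Illustrative sketch, not part of ram.c: the dispatch pattern above,
 * reduced to plain pthreads.  A request goes to the first idle worker;
 * if none is idle the dispatcher sleeps on a condition variable until
 * a worker marks itself done again (the worker side would set
 * ex_done[idx] = true and signal ex_done_cond).  All names are invented.
 */
#include <pthread.h>
#include <stdbool.h>

#define EX_WORKERS 4

static pthread_mutex_t ex_done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ex_done_cond = PTHREAD_COND_INITIALIZER;
static bool ex_done[EX_WORKERS] = { true, true, true, true };

static int pick_idle_worker(void)
{
    int idx;

    pthread_mutex_lock(&ex_done_lock);
    for (;;) {
        for (idx = 0; idx < EX_WORKERS; idx++) {
            if (ex_done[idx]) {
                ex_done[idx] = false;    /* claim this worker slot */
                pthread_mutex_unlock(&ex_done_lock);
                return idx;
            }
        }
        /* every worker is busy: wait until one finishes */
        pthread_cond_wait(&ex_done_cond, &ex_done_lock);
    }
}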
2309
3d0684b2
JQ
2310/**
2311 * ram_postcopy_incoming_init: allocate postcopy data structures
2312 *
2313 * Returns 0 for success and negative if there was one error
2314 *
2315 * @mis: current migration incoming state
2316 *
2317 * Allocate data structures etc needed by incoming migration with
2318 * postcopy-ram. postcopy-ram's similarly named
2319 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2320 */
2321int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2322{
b8c48993 2323 unsigned long ram_pages = last_ram_page();
1caddf8a
DDAG
2324
2325 return postcopy_ram_incoming_init(mis, ram_pages);
2326}
2327
3d0684b2
JQ
2328/**
2329 * ram_load_postcopy: load a page in postcopy case
2330 *
2331 * Returns 0 for success or -errno in case of error
2332 *
a7180877
DDAG
2333 * Called in postcopy mode by ram_load().
2334 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2335 *
2336 * @f: QEMUFile where to send the data
a7180877
DDAG
2337 */
2338static int ram_load_postcopy(QEMUFile *f)
2339{
2340 int flags = 0, ret = 0;
2341 bool place_needed = false;
28abd200 2342 bool matching_page_sizes = false;
a7180877
DDAG
2343 MigrationIncomingState *mis = migration_incoming_get_current();
2344 /* Temporary page that is later 'placed' */
2345 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2346 void *last_host = NULL;
a3b6ff6d 2347 bool all_zero = false;
a7180877
DDAG
2348
2349 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2350 ram_addr_t addr;
2351 void *host = NULL;
2352 void *page_buffer = NULL;
2353 void *place_source = NULL;
df9ff5e1 2354 RAMBlock *block = NULL;
a7180877 2355 uint8_t ch;
a7180877
DDAG
2356
2357 addr = qemu_get_be64(f);
2358 flags = addr & ~TARGET_PAGE_MASK;
2359 addr &= TARGET_PAGE_MASK;
2360
2361 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2362 place_needed = false;
bb890ed5 2363 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2364 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2365
2366 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2367 if (!host) {
2368 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2369 ret = -EINVAL;
2370 break;
2371 }
28abd200 2372 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2373 /*
28abd200
DDAG
2374 * Postcopy requires that we place whole host pages atomically;
2375 * these may be huge pages for RAMBlocks that are backed by
2376 * hugetlbfs.
a7180877
DDAG
2377 * To make it atomic, the data is read into a temporary page
2378 * that's moved into place later.
2379 * The migration protocol uses possibly smaller target pages;
2380 * however, the source ensures it always sends all the components
2381 * of a host page in order.
2382 */
2383 page_buffer = postcopy_host_page +
28abd200 2384 ((uintptr_t)host & (block->page_size - 1));
a7180877 2385 /* If all TP are zero then we can optimise the place */
28abd200 2386 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2387 all_zero = true;
c53b7ddc
DDAG
2388 } else {
2389 /* not the 1st TP within the HP */
2390 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2391 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2392 host, last_host);
2393 ret = -EINVAL;
2394 break;
2395 }
a7180877
DDAG
2396 }
2397
c53b7ddc 2398
a7180877
DDAG
2399 /*
2400 * If it's the last part of a host page then we place the host
2401 * page
2402 */
2403 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2404 (block->page_size - 1)) == 0;
a7180877
DDAG
2405 place_source = postcopy_host_page;
2406 }
c53b7ddc 2407 last_host = host;
a7180877
DDAG
2408
2409 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 2410 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
2411 ch = qemu_get_byte(f);
2412 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2413 if (ch) {
2414 all_zero = false;
2415 }
2416 break;
2417
2418 case RAM_SAVE_FLAG_PAGE:
2419 all_zero = false;
2420 if (!place_needed || !matching_page_sizes) {
2421 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2422 } else {
2423 /* Avoids the qemu_file copy during postcopy, which is
2424 * going to do a copy later; can only do it when we
2425 * do this read in one go (matching page sizes)
2426 */
2427 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2428 TARGET_PAGE_SIZE);
2429 }
2430 break;
2431 case RAM_SAVE_FLAG_EOS:
2432 /* normal exit */
2433 break;
2434 default:
2435 error_report("Unknown combination of migration flags: %#x"
2436 " (postcopy mode)", flags);
2437 ret = -EINVAL;
2438 }
2439
2440 if (place_needed) {
2441 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2442 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2443
a7180877 2444 if (all_zero) {
df9ff5e1
DDAG
2445 ret = postcopy_place_page_zero(mis, place_dest,
2446 block->page_size);
a7180877 2447 } else {
df9ff5e1
DDAG
2448 ret = postcopy_place_page(mis, place_dest,
2449 place_source, block->page_size);
a7180877
DDAG
2450 }
2451 }
2452 if (!ret) {
2453 ret = qemu_file_get_error(f);
2454 }
2455 }
2456
2457 return ret;
2458}
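/*
 * Illustrative sketch, not part of ram.c: the offset arithmetic used
 * above when reassembling a huge host page from target pages, assuming
 * a 2 MiB host page and 4 KiB target pages.  The addresses are
 * invented and place_dst is only meaningful for the last target page.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t host_page = 2 * 1024 * 1024;        /* block->page_size */
    uint64_t tp        = 4096;                   /* TARGET_PAGE_SIZE */
    uint64_t host      = 0x40000000 + 5 * tp;    /* 6th target page  */

    uint64_t buf_off   = host & (host_page - 1);             /* 0x5000 */
    int      is_last   = ((host + tp) & (host_page - 1)) == 0;
    uint64_t place_dst = host + tp - host_page;

    printf("buffer offset %#lx, last target page in host page: %d\n",
           (unsigned long)buf_off, is_last);
    (void)place_dst;
    return 0;
}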
2459
56e93d26
JQ
2460static int ram_load(QEMUFile *f, void *opaque, int version_id)
2461{
2462 int flags = 0, ret = 0;
2463 static uint64_t seq_iter;
2464 int len = 0;
a7180877
DDAG
2465 /*
2466 * If system is running in postcopy mode, page inserts to host memory must
2467 * be atomic
2468 */
2469 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2470 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2471 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2472
2473 seq_iter++;
2474
2475 if (version_id != 4) {
2476 ret = -EINVAL;
2477 }
2478
2479 /* This RCU critical section can be very long running.
2480 * When RCU reclaims in the code start to become numerous,
2481 * it will be necessary to reduce the granularity of this
2482 * critical section.
2483 */
2484 rcu_read_lock();
a7180877
DDAG
2485
2486 if (postcopy_running) {
2487 ret = ram_load_postcopy(f);
2488 }
2489
2490 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2491 ram_addr_t addr, total_ram_bytes;
a776aa15 2492 void *host = NULL;
56e93d26
JQ
2493 uint8_t ch;
2494
2495 addr = qemu_get_be64(f);
2496 flags = addr & ~TARGET_PAGE_MASK;
2497 addr &= TARGET_PAGE_MASK;
2498
bb890ed5 2499 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 2500 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2501 RAMBlock *block = ram_block_from_stream(f, flags);
2502
2503 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2504 if (!host) {
2505 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2506 ret = -EINVAL;
2507 break;
2508 }
1db9d8e5 2509 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
2510 }
2511
56e93d26
JQ
2512 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2513 case RAM_SAVE_FLAG_MEM_SIZE:
2514 /* Synchronize RAM block list */
2515 total_ram_bytes = addr;
2516 while (!ret && total_ram_bytes) {
2517 RAMBlock *block;
56e93d26
JQ
2518 char id[256];
2519 ram_addr_t length;
2520
2521 len = qemu_get_byte(f);
2522 qemu_get_buffer(f, (uint8_t *)id, len);
2523 id[len] = 0;
2524 length = qemu_get_be64(f);
2525
e3dd7493
DDAG
2526 block = qemu_ram_block_by_name(id);
2527 if (block) {
2528 if (length != block->used_length) {
2529 Error *local_err = NULL;
56e93d26 2530
fa53a0e5 2531 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2532 &local_err);
2533 if (local_err) {
2534 error_report_err(local_err);
56e93d26 2535 }
56e93d26 2536 }
ef08fb38
DDAG
2537 /* For postcopy we need to check hugepage sizes match */
2538 if (postcopy_advised &&
2539 block->page_size != qemu_host_page_size) {
2540 uint64_t remote_page_size = qemu_get_be64(f);
2541 if (remote_page_size != block->page_size) {
2542 error_report("Mismatched RAM page size %s "
2543 "(local) %zd != %" PRId64,
2544 id, block->page_size,
2545 remote_page_size);
2546 ret = -EINVAL;
2547 }
2548 }
e3dd7493
DDAG
2549 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2550 block->idstr);
2551 } else {
56e93d26
JQ
2552 error_report("Unknown ramblock \"%s\", cannot "
2553 "accept migration", id);
2554 ret = -EINVAL;
2555 }
2556
2557 total_ram_bytes -= length;
2558 }
2559 break;
a776aa15 2560
bb890ed5 2561 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
2562 ch = qemu_get_byte(f);
2563 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2564 break;
a776aa15 2565
56e93d26 2566 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2567 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2568 break;
56e93d26 2569
a776aa15 2570 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2571 len = qemu_get_be32(f);
2572 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2573 error_report("Invalid compressed data length: %d", len);
2574 ret = -EINVAL;
2575 break;
2576 }
c1bc6626 2577 decompress_data_with_multi_threads(f, host, len);
56e93d26 2578 break;
a776aa15 2579
56e93d26 2580 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2581 if (load_xbzrle(f, addr, host) < 0) {
2582 error_report("Failed to decompress XBZRLE page at "
2583 RAM_ADDR_FMT, addr);
2584 ret = -EINVAL;
2585 break;
2586 }
2587 break;
2588 case RAM_SAVE_FLAG_EOS:
2589 /* normal exit */
2590 break;
2591 default:
2592 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2593 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2594 } else {
2595 error_report("Unknown combination of migration flags: %#x",
2596 flags);
2597 ret = -EINVAL;
2598 }
2599 }
2600 if (!ret) {
2601 ret = qemu_file_get_error(f);
2602 }
2603 }
2604
5533b2e9 2605 wait_for_decompress_done();
56e93d26 2606 rcu_read_unlock();
55c4446b 2607 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2608 return ret;
2609}
2610
2611static SaveVMHandlers savevm_ram_handlers = {
2612 .save_live_setup = ram_save_setup,
2613 .save_live_iterate = ram_save_iterate,
763c906b 2614 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2615 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2616 .save_live_pending = ram_save_pending,
2617 .load_state = ram_load,
6ad2a215 2618 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2619};
2620
2621void ram_mig_init(void)
2622{
2623 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2624 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2625}