[mirror_qemu.git] / migration / ram.c (blame: "ram: Remove ram_save_remaining")

/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"

/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100

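/* These flags travel OR-ed into the low bits of the 64-bit page offset
 * written by save_page_header() below; since page offsets are target-page
 * aligned, the low bits are always zero and can carry flags. For example,
 * with 4KiB target pages an offset of 0x2000 sent with
 * RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE appears on the wire as
 * 0x2000 | 0x08 | 0x20 = 0x2028, and the receiver masks the flags back out.
 * (Illustrative note derived from the defines above; the 4KiB page size is
 * only an example.)
 */
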
static uint8_t *ZERO_TARGET_PAGE;

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns the new_size or negative in case of error.
 *
 * @new_size: new cache size
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

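/* Note that the requested size is rounded down to a power of two: a request
 * for, say, 70MiB results in a 64MiB cache, and that rounded value is what
 * is returned to the caller and what the migrate_xbzrle_cache_size()
 * comparison above checks against. (Worked example only; the behaviour
 * follows directly from the pow2floor() calls above.)
 */
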
struct RAMBitmap {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
};
typedef struct RAMBitmap RAMBitmap;

/* State of RAM for migration */
struct RAMState {
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last offset we have sent data from */
    ram_addr_t last_offset;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* How many times we have synchronized the bitmap */
    uint64_t bitmap_sync_count;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Accounting fields */
    /* number of zero pages. It used to be pages filled by the same char. */
    uint64_t zero_pages;
    /* number of normal transferred pages */
    uint64_t norm_pages;
    /* Iterations since start */
    uint64_t iterations;
    /* xbzrle transmitted bytes. Note these are after compression,
     * so they can't be derived from the page counts */
    uint64_t xbzrle_bytes;
    /* xbzrle transmitted pages */
    uint64_t xbzrle_pages;
    /* xbzrle number of cache misses */
    uint64_t xbzrle_cache_miss;
    /* xbzrle miss rate */
    double xbzrle_cache_miss_rate;
    /* xbzrle number of overflows */
    uint64_t xbzrle_overflows;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* total number of bytes transferred */
    uint64_t bytes_transferred;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* Ram Bitmap protected by RCU */
    RAMBitmap *ram_bitmap;
};
typedef struct RAMState RAMState;

static RAMState ram_state;

uint64_t dup_mig_pages_transferred(void)
{
    return ram_state.zero_pages;
}

uint64_t norm_mig_pages_transferred(void)
{
    return ram_state.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return ram_state.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return ram_state.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return ram_state.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return ram_state.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return ram_state.xbzrle_overflows;
}

uint64_t ram_bytes_transferred(void)
{
    return ram_state.bytes_transferred;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current offset to search from */
    ram_addr_t   offset;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

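/* The hand-off between the migration thread and a compression worker is a
 * small per-CompressParam state machine: the migration thread only gives
 * work to a worker whose ->done flag is set, clearing it and filling in
 * ->block/->offset under ->mutex; the worker compresses the page into its
 * private QEMUFile buffer and then sets ->done and signals comp_done_cond
 * under comp_done_lock; ->quit asks the worker to exit. (Summary of the
 * protocol implemented by do_data_compress() above and
 * compress_page_with_multi_thread() further below.)
 */
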
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

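/* On the wire a page header therefore consists of an 8-byte big-endian
 * word carrying offset|flags, followed by a 1-byte idstr length and the
 * idstr bytes; the two trailing fields are only present when
 * RAM_SAVE_FLAG_CONTINUE is clear, i.e. when the page comes from a
 * different RAMBlock than the previous one. For the first page of a block
 * named "pc.ram" that is 8 + 1 + 6 = 15 bytes; later pages of the same
 * block need only the 8-byte word. (Illustrative layout derived from
 * save_page_header() above; "pc.ram" is just an example block name.)
 */
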
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}

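/* For example, assuming the default migration parameters
 * (cpu_throttle_initial = 20, cpu_throttle_increment = 10), successive
 * calls throttle the vCPUs at 20%, 30%, 40%, ... of their run time until
 * the migration converges or cpu_throttle_set() saturates. (Worked example
 * only; the actual percentages come from the migration parameters.)
 */
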
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 rs->bitmap_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
        rs->xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             rs->bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        rs->xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    rs->xbzrle_pages++;
    rs->xbzrle_bytes += bytes_xbzrle;
    rs->bytes_transferred += bytes_xbzrle;

    return 1;
}

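/* The bytes accounted for an XBZRLE page are thus:
 *
 *   bytes_xbzrle = page header + 1 (ENCODING_FLAG_XBZRLE byte)
 *                + 2 (big-endian encoded length) + encoded_len
 *
 * so a page whose delta encodes to 100 bytes, sent on a continued block,
 * costs 8 + 1 + 2 + 100 = 111 bytes on the wire. (Worked example based on
 * save_xbzrle_page() above.)
 */
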
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: starting address (typically so we can continue from previous page)
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
    if (rs->ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs, ram_addr_t addr)
{
    bool ret;
    int nr = addr >> TARGET_PAGE_BITS;
    unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;

    ret = test_and_clear_bit(nr, bitmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
                                        ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length,
                                              &rs->num_dirty_pages_period);
}

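/* The migration bitmap is indexed by global page number, i.e.
 *
 *   bit = (block->offset + offset_in_block) >> TARGET_PAGE_BITS
 *
 * which is why the helpers above first compute a per-block "base" and then
 * translate between block-relative offsets and ram_addr_t-space addresses.
 * During the bulk stage every page beyond the current one is assumed dirty,
 * so the bitmap search is skipped entirely. (Explanatory note restating the
 * helpers above.)
 */
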
/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        summary |= block->page_size;
    }

    return summary;
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    uint64_t bytes_xfer_now;

    rs->bitmap_sync_count++;

    if (!rs->bytes_xfer_prev) {
        rs->bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(rs, block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */
            bytes_xfer_now = ram_bytes_transferred();

            if (s->dirty_pages_rate &&
                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (rs->dirty_rate_high_cnt++ >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
            rs->bytes_xfer_prev = bytes_xfer_now;
        }

        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                rs->xbzrle_cache_miss_rate =
                    (double)(rs->xbzrle_cache_miss -
                             rs->xbzrle_cache_miss_prev) /
                    (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
        }
        s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = rs->bitmap_sync_count;
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
    }
}

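/* Put differently, the guest throttle is started or increased on every
 * third sync period in which
 *
 *   dirty_bytes_this_period > bytes_sent_this_period / 2
 *
 * holds (the check runs at most once per second and the counter only
 * resets after a throttle bump), i.e. when the guest keeps dirtying memory
 * at more than half the rate we can send it. (Restatement of the
 * auto-converge check in migration_bitmap_sync() above.)
 */
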
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 */
static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset, uint8_t *p)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        rs->zero_pages++;
        rs->bytes_transferred +=
            save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        rs->bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}

static void ram_release_pages(MigrationState *ms, const char *rbname,
                              uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
        return;
    }

    ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                         PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        rs->bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                rs->norm_pages++;
            } else if (bytes_xmit == 0) {
                rs->zero_pages++;
            }
        }
    } else {
        pages = save_zero_page(rs, f, block, offset, p);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(ms, block->idstr, pss->offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, f, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        rs->bytes_transferred += save_page_header(f, block,
                                                  offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy(ms));
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        rs->bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        rs->norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

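/* ram_save_page() therefore tries, in order:
 *   1. an out-of-band transport hook (ram_control_save_page(), e.g. RDMA),
 *   2. a zero-page test,
 *   3. XBZRLE delta encoding (only outside the bulk stage and postcopy),
 * and finally falls back to sending the full TARGET_PAGE_SIZE bytes with
 * RAM_SAVE_FLAG_PAGE. (Summary of the control flow above.)
 */
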
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(migrate_get_current(), block->idstr,
                          offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs, QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            rs->bytes_transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, QEMUFile *f,
                                           RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                rs->norm_pages++;
                rs->bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
                                    QEMUFile *f,
                                    PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        rs->bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                rs->norm_pages++;
            } else if (bytes_xmit == 0) {
                rs->zero_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs, f);
            pages = save_zero_page(rs, f, block, offset, p);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    rs->bytes_transferred += bytes_xmit + blen;
                    rs->norm_pages++;
                    pages = 1;
                } else {
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        } else {
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(rs, f, block, offset, p);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, f, block, offset);
            } else {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        }
    }

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns if a page is found
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
                             bool *again, ram_addr_t *ram_addr_abs)
{
    pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
                                              ram_addr_abs);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->offset >= rs->last_offset) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (pss->offset >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->offset = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs, f);
                compression_switch = false;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @ms: current migration state
 * @offset: used to return the offset within the RAMBlock
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
                              ram_addr_t *ram_addr_abs)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&ms->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
        struct MigrationSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&ms->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
        *ram_addr_abs = (entry->offset + entry->rb->offset) &
                        TARGET_PAGE_MASK;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&ms->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns if a queued page is found
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @pss: data about the state of the current dirty page scan
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool get_queued_page(RAMState *rs, MigrationState *ms,
                            PageSearchStatus *pss,
                            ram_addr_t *ram_addr_abs)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(ms, &offset, ram_addr_abs);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long *bitmap;
            bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(
                    block->idstr, (uint64_t)offset,
                    (uint64_t)*ram_addr_abs,
                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
                         atomic_rcu_read(&rs->ram_bitmap)->unsentmap));
            } else {
                trace_get_queued_page(block->idstr,
                                      (uint64_t)offset,
                                      (uint64_t)*ram_addr_abs);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->offset = offset;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left; if any page is left, we drop it.
 *
 * @ms: current migration state
 */
void migration_page_queue_free(MigrationState *ms)
{
    struct MigrationSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @ms: current migration state
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                         ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;

    ms->postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = ms->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        ms->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct MigrationSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct MigrationSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&ms->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&ms->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

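/* Typical caller: when the destination faults on a not-yet-received page
 * during postcopy it sends a page request back over the return path, and
 * the source turns that into something like
 *
 *     ram_save_queue_pages(ms, "pc.ram", faulting_offset, TARGET_PAGE_SIZE);
 *
 * so the requested page jumps ahead of the background scan via
 * get_queued_page(). (Illustrative call only; "pc.ram" and faulting_offset
 * are placeholder values, not taken from this file.)
 */
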
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                                PageSearchStatus *pss,
                                bool last_stage,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check if the page is dirty and if so, send it */
    if (migration_bitmap_clear_dirty(rs, dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(rs, ms, f, pss, last_stage);
        } else {
            res = ram_save_page(rs, ms, f, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            rs->last_sent_block = pss->block;
        }
    }

    return res;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                              PageSearchStatus *pss,
                              bool last_stage,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    size_t pagesize = qemu_ram_pagesize(pss->block);

    do {
        tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
                                        dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (pss->offset & (pagesize - 1));

    /* The offset we leave with is the last one we looked at */
    pss->offset -= TARGET_PAGE_SIZE;
    return pages;
}

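/* For a RAMBlock backed by 2MiB huge pages with a 4KiB target page size,
 * one call to ram_save_host_page() therefore walks up to
 * 2MiB / 4KiB = 512 target pages, sending only those still marked dirty;
 * when the block's page size equals the target page size the loop body
 * runs exactly once. (Worked example assuming x86-style 4KiB/2MiB page
 * sizes.)
 */
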
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage)
{
    PageSearchStatus pss;
    MigrationState *ms = migrate_get_current();
    int pages = 0;
    bool again, found;
    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                 ram_addr_t space */

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.offset = rs->last_offset;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
        }

        if (found) {
            pages = ram_save_host_page(rs, ms, f, &pss, last_stage,
                                       dirty_ram_abs);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_offset = pss.offset;

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    RAMState *rs = &ram_state;

    if (zero) {
        rs->zero_pages += pages;
    } else {
        rs->norm_pages += pages;
        rs->bytes_transferred += size;
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}

void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}

static void migration_bitmap_free(struct RAMBitmap *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap);
    g_free(bmap);
}

static void ram_migration_cleanup(void *opaque)
{
    RAMState *rs = opaque;

    /* Caller must hold the iothread lock or be in a bh, so there is
     * no writing race against this migration_bitmap
     */
    struct RAMBitmap *bitmap = rs->ram_bitmap;
    atomic_rcu_set(&rs->ram_bitmap, NULL);
    if (bitmap) {
        memory_global_dirty_log_stop();
        call_rcu(bitmap, migration_bitmap_free, rcu);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(ZERO_TARGET_PAGE);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_offset = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    RAMState *rs = &ram_state;

    /* called in qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (rs->ram_bitmap) {
        struct RAMBitmap *old_bitmap = rs->ram_bitmap, *bitmap;
        bitmap = g_new(struct RAMBitmap, 1);
        bitmap->bmap = bitmap_new(new);

        /* Prevent bits in the migration bitmap from being set by
         * migration_bitmap_sync_range() at the same time.
         * It is safe for migration if a bit in the migration bitmap is
         * cleared at the same time.
         */
        qemu_mutex_lock(&rs->bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&rs->ram_bitmap, bitmap);
        qemu_mutex_unlock(&rs->bitmap_mutex);
        rs->migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    RAMState *rs = &ram_state;
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&rs->ram_bitmap)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    RAMState *rs = &ram_state;
    struct RAMBlock *block;
    unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
        unsigned long run_start = find_next_zero_bit(bitmap, range, first);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        unsigned long start,
                                        unsigned long length)
{
    RAMState *rs = &ram_state;
    unsigned long end = start + length; /* one after the end */
    unsigned long current;
    unsigned long *unsentmap;

    unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
    for (current = start; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

3d0684b2
JQ
1650/**
1651 * postcopy_each_ram_send_discard: discard all RAMBlocks
1652 *
1653 * Returns 0 for success or negative for error
1654 *
e0b266f0
DDAG
1655 * Utility for the outgoing postcopy code.
1656 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1657 * passing it bitmap indexes and name.
e0b266f0
DDAG
1658 * (qemu_ram_foreach_block ends up passing unscaled lengths
1659 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1660 *
1661 * @ms: current migration state
e0b266f0
DDAG
1662 */
1663static int postcopy_each_ram_send_discard(MigrationState *ms)
1664{
1665 struct RAMBlock *block;
1666 int ret;
1667
1668 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1669 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1670 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1671 first,
1672 block->idstr);
1673
1674 /*
1675 * Postcopy sends chunks of bitmap over the wire, but it
1676 * just needs indexes at this point, which avoids it having
1677 * target-page specific code.
1678 */
1679 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1680 block->used_length >> TARGET_PAGE_BITS);
1681 postcopy_discard_send_finish(ms, pds);
1682 if (ret) {
1683 return ret;
1684 }
1685 }
1686
1687 return 0;
1688}
1689
3d0684b2
JQ
1690/**
1691 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1692 *
1693 * Helper for postcopy_chunk_hostpages; it's called twice to
1694 * canonicalize the two bitmaps, that are similar, but one is
1695 * inverted.
99e314eb 1696 *
3d0684b2
JQ
1697 * Postcopy requires that all target pages in a hostpage are dirty or
1698 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1699 *
3d0684b2
JQ
1700 * @ms: current migration state
1701 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1702 * otherwise we need to canonicalize partially dirty host pages
1703 * @block: block that contains the page we want to canonicalize
1704 * @pds: state for postcopy
99e314eb
DDAG
1705 */
1706static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1707 RAMBlock *block,
1708 PostcopyDiscardState *pds)
1709{
0d8ec885 1710 RAMState *rs = &ram_state;
99e314eb
DDAG
1711 unsigned long *bitmap;
1712 unsigned long *unsentmap;
29c59172 1713 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1714 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1715 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1716 unsigned long last = first + (len - 1);
1717 unsigned long run_start;
1718
29c59172
DDAG
1719 if (block->page_size == TARGET_PAGE_SIZE) {
1720 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1721 return;
1722 }
1723
eb859c53
JQ
1724 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
1725 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
99e314eb
DDAG
1726
1727 if (unsent_pass) {
1728 /* Find a sent page */
1729 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1730 } else {
1731 /* Find a dirty page */
1732 run_start = find_next_bit(bitmap, last + 1, first);
1733 }
1734
1735 while (run_start <= last) {
1736 bool do_fixup = false;
1737 unsigned long fixup_start_addr;
1738 unsigned long host_offset;
1739
1740 /*
1741 * If the start of this run of pages is in the middle of a host
1742 * page, then we need to fixup this host page.
1743 */
1744 host_offset = run_start % host_ratio;
1745 if (host_offset) {
1746 do_fixup = true;
1747 run_start -= host_offset;
1748 fixup_start_addr = run_start;
1749 /* For the next pass */
1750 run_start = run_start + host_ratio;
1751 } else {
1752 /* Find the end of this run */
1753 unsigned long run_end;
1754 if (unsent_pass) {
1755 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1756 } else {
1757 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1758 }
1759 /*
1760 * If the end isn't at the start of a host page, then the
1761 * run doesn't finish at the end of a host page
1762 * and we need to discard.
1763 */
1764 host_offset = run_end % host_ratio;
1765 if (host_offset) {
1766 do_fixup = true;
1767 fixup_start_addr = run_end - host_offset;
1768 /*
1769 * This host page has gone, the next loop iteration starts
1770 * from after the fixup
1771 */
1772 run_start = fixup_start_addr + host_ratio;
1773 } else {
1774 /*
1775 * No discards on this iteration, next loop starts from
1776 * next sent/dirty page
1777 */
1778 run_start = run_end + 1;
1779 }
1780 }
1781
1782 if (do_fixup) {
1783 unsigned long page;
1784
1785 /* Tell the destination to discard this page */
1786 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1787 /* For the unsent_pass we:
1788 * discard partially sent pages
1789 * For the !unsent_pass (dirty) we:
1790 * discard partially dirty pages that were sent
1791 * (any partially sent pages were already discarded
1792 * by the previous unsent_pass)
1793 */
1794 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1795 host_ratio);
1796 }
1797
1798 /* Clean up the bitmap */
1799 for (page = fixup_start_addr;
1800 page < fixup_start_addr + host_ratio; page++) {
1801 /* All pages in this host page are now not sent */
1802 set_bit(page, unsentmap);
1803
1804 /*
1805 * Remark them as dirty, updating the count for any pages
1806 * that weren't previously dirty.
1807 */
0d8ec885 1808 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1809 }
1810 }
1811
1812 if (unsent_pass) {
1813 /* Find the next sent page for the next iteration */
1814 run_start = find_next_zero_bit(unsentmap, last + 1,
1815 run_start);
1816 } else {
1817 /* Find the next dirty page for the next iteration */
1818 run_start = find_next_bit(bitmap, last + 1, run_start);
1819 }
1820 }
1821}
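
/*
 * Illustrative sketch (not part of the original file): the host-page
 * alignment arithmetic that postcopy_chunk_hostpages_pass() applies
 * above.  Given a target-page index and the host-page/target-page
 * ratio, it returns the first target page of the containing host page
 * (e.g. with 2MB huge pages and 4KB target pages host_ratio is 512).
 * The function name is hypothetical.
 */
static inline unsigned long host_page_start_example(unsigned long tp_index,
                                                    unsigned int host_ratio)
{
    return tp_index - (tp_index % host_ratio);
}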
1822
3d0684b2
JQ
1823/**
1824 * postcopy_chunk_hostpages: discard any partially sent host page
1825 *
99e314eb
DDAG
1826 * Utility for the outgoing postcopy code.
1827 *
1828 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1829 * dirty host-page size chunks as all dirty. In this case the host-page
1830 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1831 *
3d0684b2
JQ
1832 * Returns zero on success
1833 *
1834 * @ms: current migration state
99e314eb
DDAG
1835 */
1836static int postcopy_chunk_hostpages(MigrationState *ms)
1837{
6f37bb8b 1838 RAMState *rs = &ram_state;
99e314eb
DDAG
1839 struct RAMBlock *block;
1840
99e314eb 1841 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1842 rs->last_seen_block = NULL;
1843 rs->last_sent_block = NULL;
1844 rs->last_offset = 0;
99e314eb
DDAG
1845
1846 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1847 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1848
1849 PostcopyDiscardState *pds =
1850 postcopy_discard_send_init(ms, first, block->idstr);
1851
1852 /* First pass: Discard all partially sent host pages */
1853 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1854 /*
1855 * Second pass: Ensure that all partially dirty host pages are made
1856 * fully dirty.
1857 */
1858 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1859
1860 postcopy_discard_send_finish(ms, pds);
1861 } /* ram_list loop */
1862
1863 return 0;
1864}
1865
3d0684b2
JQ
1866/**
1867 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1868 *
1869 * Returns zero on success
1870 *
e0b266f0
DDAG
1871 * Transmit the set of pages to be discarded after precopy to the target;
1872 * these are pages that:
1873 * a) Have been previously transmitted but are now dirty again
1874 * b) Pages that have never been transmitted, this ensures that
1875 * any pages on the destination that have been mapped by background
1876 * tasks get discarded (transparent huge pages is the specific concern)
1877 * Hopefully this is pretty sparse
3d0684b2
JQ
1878 *
1879 * @ms: current migration state
e0b266f0
DDAG
1880 */
1881int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1882{
eb859c53 1883 RAMState *rs = &ram_state;
e0b266f0
DDAG
1884 int ret;
1885 unsigned long *bitmap, *unsentmap;
1886
1887 rcu_read_lock();
1888
1889 /* This should be our last sync, the src is now paused */
eb859c53 1890 migration_bitmap_sync(rs);
e0b266f0 1891
eb859c53 1892 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1893 if (!unsentmap) {
1894 /* We don't have a safe way to resize the sentmap, so
1895 * if the bitmap was resized it will be NULL at this
1896 * point.
1897 */
1898 error_report("migration ram resized during precopy phase");
1899 rcu_read_unlock();
1900 return -EINVAL;
1901 }
1902
29c59172 1903 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1904 ret = postcopy_chunk_hostpages(ms);
1905 if (ret) {
1906 rcu_read_unlock();
1907 return ret;
1908 }
1909
e0b266f0
DDAG
1910 /*
1911 * Update the unsentmap to be unsentmap = unsentmap | dirty
1912 */
eb859c53 1913 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
e0b266f0
DDAG
1914 bitmap_or(unsentmap, unsentmap, bitmap,
1915 last_ram_offset() >> TARGET_PAGE_BITS);
1916
1917
1918 trace_ram_postcopy_send_discard_bitmap();
1919#ifdef DEBUG_POSTCOPY
1920 ram_debug_dump_bitmap(unsentmap, true);
1921#endif
1922
1923 ret = postcopy_each_ram_send_discard(ms);
1924 rcu_read_unlock();
1925
1926 return ret;
1927}
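
/*
 * Illustrative sketch (not part of the original file): the
 * "unsentmap |= dirty" merge performed above, on a throwaway pair of
 * bitmaps.  It only uses bitmap_new()/set_bit()/bitmap_or()/g_free()
 * as already used elsewhere in this file; the function name is
 * hypothetical.
 */
static inline void discard_bitmap_merge_example(void)
{
    const long nbits = 64;
    unsigned long *unsent = bitmap_new(nbits);  /* never transmitted */
    unsigned long *dirty = bitmap_new(nbits);   /* re-dirtied after sending */

    set_bit(3, unsent);
    set_bit(40, dirty);

    /* after the merge both page 3 and page 40 will be discarded */
    bitmap_or(unsent, unsent, dirty, nbits);

    g_free(unsent);
    g_free(dirty);
}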
1928
3d0684b2
JQ
1929/**
1930 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1931 *
3d0684b2 1932 * Returns zero on success
e0b266f0 1933 *
3d0684b2 1934 * @mis: current migration incoming state
36449157
JQ
1935 * @rbname: name of the RAMBlock of the request. NULL means the
1936 * same as the last one.
3d0684b2
JQ
1937 * @start: RAMBlock starting page
1938 * @length: RAMBlock size
e0b266f0
DDAG
1939 */
1940int ram_discard_range(MigrationIncomingState *mis,
36449157 1941 const char *rbname,
e0b266f0
DDAG
1942 uint64_t start, size_t length)
1943{
1944 int ret = -1;
1945
36449157 1946 trace_ram_discard_range(rbname, start, length);
d3a5038c 1947
e0b266f0 1948 rcu_read_lock();
36449157 1949 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1950
1951 if (!rb) {
36449157 1952 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1953 goto err;
1954 }
1955
d3a5038c 1956 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1957
1958err:
1959 rcu_read_unlock();
1960
1961 return ret;
1962}
1963
ceb4d168 1964static int ram_state_init(RAMState *rs)
56e93d26 1965{
56e93d26
JQ
1966 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1967
ceb4d168 1968 memset(rs, 0, sizeof(*rs));
108cfae0 1969 qemu_mutex_init(&rs->bitmap_mutex);
56e93d26
JQ
1970
1971 if (migrate_use_xbzrle()) {
1972 XBZRLE_cache_lock();
adb65dec 1973 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1974 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1975 TARGET_PAGE_SIZE,
1976 TARGET_PAGE_SIZE);
1977 if (!XBZRLE.cache) {
1978 XBZRLE_cache_unlock();
1979 error_report("Error creating cache");
1980 return -1;
1981 }
1982 XBZRLE_cache_unlock();
1983
1984 /* We prefer not to abort if there is no memory */
1985 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1986 if (!XBZRLE.encoded_buf) {
1987 error_report("Error allocating encoded_buf");
1988 return -1;
1989 }
1990
1991 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1992 if (!XBZRLE.current_buf) {
1993 error_report("Error allocating current_buf");
1994 g_free(XBZRLE.encoded_buf);
1995 XBZRLE.encoded_buf = NULL;
1996 return -1;
1997 }
56e93d26
JQ
1998 }
1999
49877834
PB
2000 /* For memory_global_dirty_log_start below. */
2001 qemu_mutex_lock_iothread();
2002
56e93d26
JQ
2003 qemu_mutex_lock_ramlist();
2004 rcu_read_lock();
6f37bb8b 2005 ram_state_reset(rs);
56e93d26 2006
eb859c53 2007 rs->ram_bitmap = g_new0(struct RAMBitmap, 1);
0827b9e9
AA
2008 /* Skip setting bitmap if there is no RAM */
2009 if (ram_bytes_total()) {
2010 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53
JQ
2011 rs->ram_bitmap->bmap = bitmap_new(ram_bitmap_pages);
2012 bitmap_set(rs->ram_bitmap->bmap, 0, ram_bitmap_pages);
0827b9e9
AA
2013
2014 if (migrate_postcopy_ram()) {
eb859c53
JQ
2015 rs->ram_bitmap->unsentmap = bitmap_new(ram_bitmap_pages);
2016 bitmap_set(rs->ram_bitmap->unsentmap, 0, ram_bitmap_pages);
0827b9e9 2017 }
f3f491fc
DDAG
2018 }
2019
56e93d26
JQ
2020 /*
2021 * Count the total number of pages used by ram blocks not including any
2022 * gaps due to alignment or unplugs.
2023 */
0d8ec885 2024 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
2025
2026 memory_global_dirty_log_start();
8d820d6f 2027 migration_bitmap_sync(rs);
56e93d26 2028 qemu_mutex_unlock_ramlist();
49877834 2029 qemu_mutex_unlock_iothread();
a91246c9
HZ
2030 rcu_read_unlock();
2031
2032 return 0;
2033}
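
/*
 * Illustrative sketch (not part of the original file): the initial
 * bitmap set-up done by ram_state_init() above - one bit per target
 * page, with every page initially marked dirty so that the first
 * migration pass sends all of RAM.  The parameter and function names
 * are hypothetical.
 */
static inline unsigned long *new_all_dirty_bitmap_example(uint64_t ram_bytes)
{
    int64_t pages = ram_bytes >> TARGET_PAGE_BITS;
    unsigned long *bmap = bitmap_new(pages);

    bitmap_set(bmap, 0, pages);         /* mark every page dirty */
    return bmap;
}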
2034
3d0684b2
JQ
2035/*
2036 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2037 * a long-running RCU critical section. When RCU reclaims in the code
2038 * start to become numerous it will be necessary to reduce the
2039 * granularity of these critical sections.
2040 */
2041
3d0684b2
JQ
2042/**
2043 * ram_save_setup: Setup RAM for migration
2044 *
2045 * Returns zero to indicate success and negative for error
2046 *
2047 * @f: QEMUFile where to send the data
2048 * @opaque: RAMState pointer
2049 */
a91246c9
HZ
2050static int ram_save_setup(QEMUFile *f, void *opaque)
2051{
6f37bb8b 2052 RAMState *rs = opaque;
a91246c9
HZ
2053 RAMBlock *block;
2054
2055 /* migration has already setup the bitmap, reuse it. */
2056 if (!migration_in_colo_state()) {
ceb4d168 2057 if (ram_state_init(rs) < 0) {
a91246c9
HZ
2058 return -1;
2059 }
2060 }
2061
2062 rcu_read_lock();
56e93d26
JQ
2063
2064 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2065
2066 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2067 qemu_put_byte(f, strlen(block->idstr));
2068 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2069 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2070 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2071 qemu_put_be64(f, block->page_size);
2072 }
56e93d26
JQ
2073 }
2074
2075 rcu_read_unlock();
2076
2077 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2078 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2079
2080 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2081
2082 return 0;
2083}
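
/*
 * Illustrative sketch (not part of the original file): the size in
 * bytes of one per-RAMBlock record as written by ram_save_setup()
 * above - a 1-byte id length, the id string, the 8-byte used_length
 * and, when the block's page size is also sent (postcopy with huge
 * pages), an 8-byte page size.  Parameter and function names are
 * hypothetical.
 */
static inline size_t setup_block_record_len_example(const char *idstr,
                                                    bool sends_page_size)
{
    return 1 + strlen(idstr) + 8 + (sends_page_size ? 8 : 0);
}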
2084
3d0684b2
JQ
2085/**
2086 * ram_save_iterate: iterative stage for migration
2087 *
2088 * Returns zero to indicate success and negative for error
2089 *
2090 * @f: QEMUFile where to send the data
2091 * @opaque: RAMState pointer
2092 */
56e93d26
JQ
2093static int ram_save_iterate(QEMUFile *f, void *opaque)
2094{
6f37bb8b 2095 RAMState *rs = opaque;
56e93d26
JQ
2096 int ret;
2097 int i;
2098 int64_t t0;
5c90308f 2099 int done = 0;
56e93d26
JQ
2100
2101 rcu_read_lock();
6f37bb8b
JQ
2102 if (ram_list.version != rs->last_version) {
2103 ram_state_reset(rs);
56e93d26
JQ
2104 }
2105
2106 /* Read version before ram_list.blocks */
2107 smp_rmb();
2108
2109 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2110
2111 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2112 i = 0;
2113 while ((ret = qemu_file_rate_limit(f)) == 0) {
2114 int pages;
2115
072c2511 2116 pages = ram_find_and_save_block(rs, f, false);
56e93d26
JQ
2117 /* no more pages to send */
2118 if (pages == 0) {
5c90308f 2119 done = 1;
56e93d26
JQ
2120 break;
2121 }
23b28c3c 2122 rs->iterations++;
070afca2 2123
56e93d26
JQ
2124 /* we want to check in the 1st loop, just in case it was the 1st time
2125 and we had to sync the dirty bitmap.
2126 qemu_clock_get_ns() is a bit expensive, so we only check once
2127 every few iterations
2128 */
2129 if ((i & 63) == 0) {
2130 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2131 if (t1 > MAX_WAIT) {
55c4446b 2132 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2133 break;
2134 }
2135 }
2136 i++;
2137 }
2f4fde93 2138 flush_compressed_data(rs, f);
56e93d26
JQ
2139 rcu_read_unlock();
2140
2141 /*
2142 * Must occur before EOS (or any QEMUFile operation)
2143 * because of RDMA protocol.
2144 */
2145 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2146
2147 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2f4fde93 2148 rs->bytes_transferred += 8;
56e93d26
JQ
2149
2150 ret = qemu_file_get_error(f);
2151 if (ret < 0) {
2152 return ret;
2153 }
2154
5c90308f 2155 return done;
56e93d26
JQ
2156}
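
/*
 * Illustrative sketch (not part of the original file): the amortised
 * clock check used in ram_save_iterate() above - the clock is only
 * read once every 64 iterations because querying it is relatively
 * expensive.  The budget parameter and function name are hypothetical.
 */
static inline bool iteration_over_budget_example(int64_t start_ns, int i,
                                                 int64_t budget_ms)
{
    int64_t elapsed_ms;

    if ((i & 63) != 0) {
        return false;                   /* skip the clock read entirely */
    }
    elapsed_ms = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_ns) / 1000000;
    return elapsed_ms > budget_ms;
}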
2157
3d0684b2
JQ
2158/**
2159 * ram_save_complete: function called to send the remaining amount of ram
2160 *
2161 * Returns zero to indicate success
2162 *
2163 * Called with iothread lock
2164 *
2165 * @f: QEMUFile where to send the data
2166 * @opaque: RAMState pointer
2167 */
56e93d26
JQ
2168static int ram_save_complete(QEMUFile *f, void *opaque)
2169{
6f37bb8b
JQ
2170 RAMState *rs = opaque;
2171
56e93d26
JQ
2172 rcu_read_lock();
2173
663e6c1d 2174 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2175 migration_bitmap_sync(rs);
663e6c1d 2176 }
56e93d26
JQ
2177
2178 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2179
2180 /* try transferring iterative blocks of memory */
2181
2182 /* flush all remaining blocks regardless of rate limiting */
2183 while (true) {
2184 int pages;
2185
072c2511 2186 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state());
56e93d26
JQ
2187 /* no more blocks to send */
2188 if (pages == 0) {
2189 break;
2190 }
2191 }
2192
2f4fde93 2193 flush_compressed_data(rs, f);
56e93d26 2194 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2195
2196 rcu_read_unlock();
d09a6fde 2197
56e93d26
JQ
2198 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2199
2200 return 0;
2201}
2202
c31b098f
DDAG
2203static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2204 uint64_t *non_postcopiable_pending,
2205 uint64_t *postcopiable_pending)
56e93d26 2206{
8d820d6f 2207 RAMState *rs = opaque;
56e93d26
JQ
2208 uint64_t remaining_size;
2209
9edabd4d 2210 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2211
663e6c1d
DDAG
2212 if (!migration_in_postcopy(migrate_get_current()) &&
2213 remaining_size < max_size) {
56e93d26
JQ
2214 qemu_mutex_lock_iothread();
2215 rcu_read_lock();
8d820d6f 2216 migration_bitmap_sync(rs);
56e93d26
JQ
2217 rcu_read_unlock();
2218 qemu_mutex_unlock_iothread();
9edabd4d 2219 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2220 }
c31b098f
DDAG
2221
2222 /* We can do postcopy, and all the data is postcopiable */
2223 *postcopiable_pending += remaining_size;
56e93d26
JQ
2224}
2225
2226static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2227{
2228 unsigned int xh_len;
2229 int xh_flags;
063e760a 2230 uint8_t *loaded_data;
56e93d26
JQ
2231
2232 if (!xbzrle_decoded_buf) {
2233 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2234 }
063e760a 2235 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2236
2237 /* extract RLE header */
2238 xh_flags = qemu_get_byte(f);
2239 xh_len = qemu_get_be16(f);
2240
2241 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2242 error_report("Failed to load XBZRLE page - wrong compression!");
2243 return -1;
2244 }
2245
2246 if (xh_len > TARGET_PAGE_SIZE) {
2247 error_report("Failed to load XBZRLE page - len overflow!");
2248 return -1;
2249 }
2250 /* load data and decode */
063e760a 2251 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2252
2253 /* decode RLE */
063e760a 2254 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2255 TARGET_PAGE_SIZE) == -1) {
2256 error_report("Failed to load XBZRLE page - decode error!");
2257 return -1;
2258 }
2259
2260 return 0;
2261}
2262
3d0684b2
JQ
2263/**
2264 * ram_block_from_stream: read a RAMBlock id from the migration stream
2265 *
2266 * Must be called from within a rcu critical section.
2267 *
56e93d26 2268 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2269 *
3d0684b2
JQ
2270 * @f: QEMUFile where to read the data from
2271 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2272 */
3d0684b2 2273static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2274{
2275 static RAMBlock *block = NULL;
2276 char id[256];
2277 uint8_t len;
2278
2279 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2280 if (!block) {
56e93d26
JQ
2281 error_report("Ack, bad migration stream!");
2282 return NULL;
2283 }
4c4bad48 2284 return block;
56e93d26
JQ
2285 }
2286
2287 len = qemu_get_byte(f);
2288 qemu_get_buffer(f, (uint8_t *)id, len);
2289 id[len] = 0;
2290
e3dd7493 2291 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2292 if (!block) {
2293 error_report("Can't find block %s", id);
2294 return NULL;
56e93d26
JQ
2295 }
2296
4c4bad48
HZ
2297 return block;
2298}
2299
2300static inline void *host_from_ram_block_offset(RAMBlock *block,
2301 ram_addr_t offset)
2302{
2303 if (!offset_in_ramblock(block, offset)) {
2304 return NULL;
2305 }
2306
2307 return block->host + offset;
56e93d26
JQ
2308}
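
/*
 * Illustrative sketch (not part of the original file): how the loaders
 * below split the 64-bit header word read from the stream into a
 * page-aligned address and the RAM_SAVE_FLAG_* bits kept below
 * TARGET_PAGE_MASK.  The function name is hypothetical.
 */
static inline void split_header_word_example(ram_addr_t header,
                                             ram_addr_t *addr, int *flags)
{
    *flags = header & ~TARGET_PAGE_MASK;
    *addr = header & TARGET_PAGE_MASK;
}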
2309
3d0684b2
JQ
2310/**
2311 * ram_handle_compressed: handle the zero page case
2312 *
56e93d26
JQ
2313 * If a page (or a whole RDMA chunk) has been
2314 * determined to be zero, then zap it.
3d0684b2
JQ
2315 *
2316 * @host: host address for the zero page
2317 * @ch: the byte the page is filled with. We only support zero
2318 * @size: size of the zero page
56e93d26
JQ
2319 */
2320void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2321{
2322 if (ch != 0 || !is_zero_range(host, size)) {
2323 memset(host, ch, size);
2324 }
2325}
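
/*
 * Illustrative usage (not part of the original file): for a page the
 * source reported as filled with the byte 0, ram_handle_compressed()
 * only writes to the destination if the page is not already zero,
 * which avoids touching pages that are still untouched on the
 * destination.  The function name is hypothetical.
 */
static inline void zero_page_usage_example(void *host)
{
    ram_handle_compressed(host, 0, TARGET_PAGE_SIZE);
}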
2326
2327static void *do_data_decompress(void *opaque)
2328{
2329 DecompressParam *param = opaque;
2330 unsigned long pagesize;
33d151f4
LL
2331 uint8_t *des;
2332 int len;
56e93d26 2333
33d151f4 2334 qemu_mutex_lock(&param->mutex);
90e56fb4 2335 while (!param->quit) {
33d151f4
LL
2336 if (param->des) {
2337 des = param->des;
2338 len = param->len;
2339 param->des = 0;
2340 qemu_mutex_unlock(&param->mutex);
2341
56e93d26 2342 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2343 /* uncompress() can fail in some cases, especially when the page
2344 * was dirtied while it was being compressed.  That is not a problem
2345 * because the dirty page will be retransferred and uncompress()
2346 * won't corrupt the data in other pages.
2347 */
33d151f4
LL
2348 uncompress((Bytef *)des, &pagesize,
2349 (const Bytef *)param->compbuf, len);
73a8912b 2350
33d151f4
LL
2351 qemu_mutex_lock(&decomp_done_lock);
2352 param->done = true;
2353 qemu_cond_signal(&decomp_done_cond);
2354 qemu_mutex_unlock(&decomp_done_lock);
2355
2356 qemu_mutex_lock(&param->mutex);
2357 } else {
2358 qemu_cond_wait(&param->cond, &param->mutex);
2359 }
56e93d26 2360 }
33d151f4 2361 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2362
2363 return NULL;
2364}
2365
5533b2e9
LL
2366static void wait_for_decompress_done(void)
2367{
2368 int idx, thread_count;
2369
2370 if (!migrate_use_compression()) {
2371 return;
2372 }
2373
2374 thread_count = migrate_decompress_threads();
2375 qemu_mutex_lock(&decomp_done_lock);
2376 for (idx = 0; idx < thread_count; idx++) {
2377 while (!decomp_param[idx].done) {
2378 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2379 }
2380 }
2381 qemu_mutex_unlock(&decomp_done_lock);
2382}
2383
56e93d26
JQ
2384void migrate_decompress_threads_create(void)
2385{
2386 int i, thread_count;
2387
2388 thread_count = migrate_decompress_threads();
2389 decompress_threads = g_new0(QemuThread, thread_count);
2390 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2391 qemu_mutex_init(&decomp_done_lock);
2392 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2393 for (i = 0; i < thread_count; i++) {
2394 qemu_mutex_init(&decomp_param[i].mutex);
2395 qemu_cond_init(&decomp_param[i].cond);
2396 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2397 decomp_param[i].done = true;
90e56fb4 2398 decomp_param[i].quit = false;
56e93d26
JQ
2399 qemu_thread_create(decompress_threads + i, "decompress",
2400 do_data_decompress, decomp_param + i,
2401 QEMU_THREAD_JOINABLE);
2402 }
2403}
2404
2405void migrate_decompress_threads_join(void)
2406{
2407 int i, thread_count;
2408
56e93d26
JQ
2409 thread_count = migrate_decompress_threads();
2410 for (i = 0; i < thread_count; i++) {
2411 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2412 decomp_param[i].quit = true;
56e93d26
JQ
2413 qemu_cond_signal(&decomp_param[i].cond);
2414 qemu_mutex_unlock(&decomp_param[i].mutex);
2415 }
2416 for (i = 0; i < thread_count; i++) {
2417 qemu_thread_join(decompress_threads + i);
2418 qemu_mutex_destroy(&decomp_param[i].mutex);
2419 qemu_cond_destroy(&decomp_param[i].cond);
2420 g_free(decomp_param[i].compbuf);
2421 }
2422 g_free(decompress_threads);
2423 g_free(decomp_param);
56e93d26
JQ
2424 decompress_threads = NULL;
2425 decomp_param = NULL;
56e93d26
JQ
2426}
2427
c1bc6626 2428static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2429 void *host, int len)
2430{
2431 int idx, thread_count;
2432
2433 thread_count = migrate_decompress_threads();
73a8912b 2434 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2435 while (true) {
2436 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2437 if (decomp_param[idx].done) {
33d151f4
LL
2438 decomp_param[idx].done = false;
2439 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2440 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2441 decomp_param[idx].des = host;
2442 decomp_param[idx].len = len;
33d151f4
LL
2443 qemu_cond_signal(&decomp_param[idx].cond);
2444 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2445 break;
2446 }
2447 }
2448 if (idx < thread_count) {
2449 break;
73a8912b
LL
2450 } else {
2451 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2452 }
2453 }
73a8912b 2454 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2455}
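
/*
 * Illustrative sketch (not part of the original file): the worker
 * hand-off pattern used by decompress_data_with_multi_threads() above -
 * scan the pool for a worker whose 'done' flag is set, give it work and
 * signal its per-worker condition variable, otherwise sleep on the
 * shared completion condition until some worker finishes.  The
 * ExampleWorker type and the function name are hypothetical.
 */
typedef struct ExampleWorker {
    QemuMutex mutex;
    QemuCond cond;
    bool done;                          /* true while the worker is idle */
    void *work;
} ExampleWorker;

static inline int pick_idle_worker_example(ExampleWorker *w, int nworkers,
                                           void *work, QemuMutex *done_lock,
                                           QemuCond *done_cond)
{
    int idx;

    qemu_mutex_lock(done_lock);
    for (;;) {
        for (idx = 0; idx < nworkers; idx++) {
            if (w[idx].done) {
                w[idx].done = false;
                qemu_mutex_lock(&w[idx].mutex);
                w[idx].work = work;
                qemu_cond_signal(&w[idx].cond);
                qemu_mutex_unlock(&w[idx].mutex);
                qemu_mutex_unlock(done_lock);
                return idx;
            }
        }
        /* all workers busy: wait for one to report completion */
        qemu_cond_wait(done_cond, done_lock);
    }
}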
2456
3d0684b2
JQ
2457/**
2458 * ram_postcopy_incoming_init: allocate postcopy data structures
2459 *
2460 * Returns 0 for success and negative in case of error
2461 *
2462 * @mis: current migration incoming state
2463 *
2464 * Allocate data structures etc needed by incoming migration with
2465 * postcopy-ram. postcopy-ram's similarly named
2466 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2467 */
2468int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2469{
2470 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2471
2472 return postcopy_ram_incoming_init(mis, ram_pages);
2473}
2474
3d0684b2
JQ
2475/**
2476 * ram_load_postcopy: load a page in postcopy case
2477 *
2478 * Returns 0 for success or -errno in case of error
2479 *
a7180877
DDAG
2480 * Called in postcopy mode by ram_load().
2481 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2482 *
2483 * @f: QEMUFile to read the data from
a7180877
DDAG
2484 */
2485static int ram_load_postcopy(QEMUFile *f)
2486{
2487 int flags = 0, ret = 0;
2488 bool place_needed = false;
28abd200 2489 bool matching_page_sizes = false;
a7180877
DDAG
2490 MigrationIncomingState *mis = migration_incoming_get_current();
2491 /* Temporary page that is later 'placed' */
2492 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2493 void *last_host = NULL;
a3b6ff6d 2494 bool all_zero = false;
a7180877
DDAG
2495
2496 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2497 ram_addr_t addr;
2498 void *host = NULL;
2499 void *page_buffer = NULL;
2500 void *place_source = NULL;
df9ff5e1 2501 RAMBlock *block = NULL;
a7180877 2502 uint8_t ch;
a7180877
DDAG
2503
2504 addr = qemu_get_be64(f);
2505 flags = addr & ~TARGET_PAGE_MASK;
2506 addr &= TARGET_PAGE_MASK;
2507
2508 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2509 place_needed = false;
2510 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2511 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2512
2513 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2514 if (!host) {
2515 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2516 ret = -EINVAL;
2517 break;
2518 }
28abd200 2519 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2520 /*
28abd200
DDAG
2521 * Postcopy requires that we place whole host pages atomically;
2522 * these may be huge pages for RAMBlocks that are backed by
2523 * hugetlbfs.
a7180877
DDAG
2524 * To make it atomic, the data is read into a temporary page
2525 * that's moved into place later.
2526 * The migration protocol uses, possibly smaller, target pages;
2527 * however the source ensures it always sends all the components
2528 * of a host page in order.
2529 */
2530 page_buffer = postcopy_host_page +
28abd200 2531 ((uintptr_t)host & (block->page_size - 1));
a7180877 2532 /* If all TP are zero then we can optimise the place */
28abd200 2533 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2534 all_zero = true;
c53b7ddc
DDAG
2535 } else {
2536 /* not the 1st TP within the HP */
2537 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2538 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2539 host, last_host);
2540 ret = -EINVAL;
2541 break;
2542 }
a7180877
DDAG
2543 }
2544
c53b7ddc 2545
a7180877
DDAG
2546 /*
2547 * If it's the last part of a host page then we place the host
2548 * page
2549 */
2550 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2551 (block->page_size - 1)) == 0;
a7180877
DDAG
2552 place_source = postcopy_host_page;
2553 }
c53b7ddc 2554 last_host = host;
a7180877
DDAG
2555
2556 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2557 case RAM_SAVE_FLAG_COMPRESS:
2558 ch = qemu_get_byte(f);
2559 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2560 if (ch) {
2561 all_zero = false;
2562 }
2563 break;
2564
2565 case RAM_SAVE_FLAG_PAGE:
2566 all_zero = false;
2567 if (!place_needed || !matching_page_sizes) {
2568 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2569 } else {
2570 /* Avoids the qemu_file copy during postcopy, which is
2571 * going to do a copy later; can only do it when we
2572 * do this read in one go (matching page sizes)
2573 */
2574 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2575 TARGET_PAGE_SIZE);
2576 }
2577 break;
2578 case RAM_SAVE_FLAG_EOS:
2579 /* normal exit */
2580 break;
2581 default:
2582 error_report("Unknown combination of migration flags: %#x"
2583 " (postcopy mode)", flags);
2584 ret = -EINVAL;
2585 }
2586
2587 if (place_needed) {
2588 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2589 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2590
a7180877 2591 if (all_zero) {
df9ff5e1
DDAG
2592 ret = postcopy_place_page_zero(mis, place_dest,
2593 block->page_size);
a7180877 2594 } else {
df9ff5e1
DDAG
2595 ret = postcopy_place_page(mis, place_dest,
2596 place_source, block->page_size);
a7180877
DDAG
2597 }
2598 }
2599 if (!ret) {
2600 ret = qemu_file_get_error(f);
2601 }
2602 }
2603
2604 return ret;
2605}
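
/*
 * Illustrative sketch (not part of the original file): the two checks
 * ram_load_postcopy() above makes while accumulating target pages into
 * the temporary host page - where in the host page this target page
 * lands, and whether it is the last target page so the whole host page
 * can be placed atomically.  Parameter and function names are
 * hypothetical.
 */
static inline bool is_last_target_page_example(uintptr_t host,
                                               size_t host_page_size,
                                               size_t *offset_in_host_page)
{
    *offset_in_host_page = host & (host_page_size - 1);
    /* true when this target page ends exactly on a host-page boundary */
    return ((host + TARGET_PAGE_SIZE) & (host_page_size - 1)) == 0;
}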
2606
56e93d26
JQ
2607static int ram_load(QEMUFile *f, void *opaque, int version_id)
2608{
2609 int flags = 0, ret = 0;
2610 static uint64_t seq_iter;
2611 int len = 0;
a7180877
DDAG
2612 /*
2613 * If system is running in postcopy mode, page inserts to host memory must
2614 * be atomic
2615 */
2616 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2617 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2618 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2619
2620 seq_iter++;
2621
2622 if (version_id != 4) {
2623 ret = -EINVAL;
2624 }
2625
2626 /* This RCU critical section can be very long running.
2627 * When RCU reclaims in the code start to become numerous,
2628 * it will be necessary to reduce the granularity of this
2629 * critical section.
2630 */
2631 rcu_read_lock();
a7180877
DDAG
2632
2633 if (postcopy_running) {
2634 ret = ram_load_postcopy(f);
2635 }
2636
2637 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2638 ram_addr_t addr, total_ram_bytes;
a776aa15 2639 void *host = NULL;
56e93d26
JQ
2640 uint8_t ch;
2641
2642 addr = qemu_get_be64(f);
2643 flags = addr & ~TARGET_PAGE_MASK;
2644 addr &= TARGET_PAGE_MASK;
2645
a776aa15
DDAG
2646 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2647 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2648 RAMBlock *block = ram_block_from_stream(f, flags);
2649
2650 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2651 if (!host) {
2652 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2653 ret = -EINVAL;
2654 break;
2655 }
2656 }
2657
56e93d26
JQ
2658 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2659 case RAM_SAVE_FLAG_MEM_SIZE:
2660 /* Synchronize RAM block list */
2661 total_ram_bytes = addr;
2662 while (!ret && total_ram_bytes) {
2663 RAMBlock *block;
56e93d26
JQ
2664 char id[256];
2665 ram_addr_t length;
2666
2667 len = qemu_get_byte(f);
2668 qemu_get_buffer(f, (uint8_t *)id, len);
2669 id[len] = 0;
2670 length = qemu_get_be64(f);
2671
e3dd7493
DDAG
2672 block = qemu_ram_block_by_name(id);
2673 if (block) {
2674 if (length != block->used_length) {
2675 Error *local_err = NULL;
56e93d26 2676
fa53a0e5 2677 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2678 &local_err);
2679 if (local_err) {
2680 error_report_err(local_err);
56e93d26 2681 }
56e93d26 2682 }
ef08fb38
DDAG
2683 /* For postcopy we need to check hugepage sizes match */
2684 if (postcopy_advised &&
2685 block->page_size != qemu_host_page_size) {
2686 uint64_t remote_page_size = qemu_get_be64(f);
2687 if (remote_page_size != block->page_size) {
2688 error_report("Mismatched RAM page size %s "
2689 "(local) %zd != %" PRId64,
2690 id, block->page_size,
2691 remote_page_size);
2692 ret = -EINVAL;
2693 }
2694 }
e3dd7493
DDAG
2695 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2696 block->idstr);
2697 } else {
56e93d26
JQ
2698 error_report("Unknown ramblock \"%s\", cannot "
2699 "accept migration", id);
2700 ret = -EINVAL;
2701 }
2702
2703 total_ram_bytes -= length;
2704 }
2705 break;
a776aa15 2706
56e93d26 2707 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2708 ch = qemu_get_byte(f);
2709 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2710 break;
a776aa15 2711
56e93d26 2712 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2713 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2714 break;
56e93d26 2715
a776aa15 2716 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2717 len = qemu_get_be32(f);
2718 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2719 error_report("Invalid compressed data length: %d", len);
2720 ret = -EINVAL;
2721 break;
2722 }
c1bc6626 2723 decompress_data_with_multi_threads(f, host, len);
56e93d26 2724 break;
a776aa15 2725
56e93d26 2726 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2727 if (load_xbzrle(f, addr, host) < 0) {
2728 error_report("Failed to decompress XBZRLE page at "
2729 RAM_ADDR_FMT, addr);
2730 ret = -EINVAL;
2731 break;
2732 }
2733 break;
2734 case RAM_SAVE_FLAG_EOS:
2735 /* normal exit */
2736 break;
2737 default:
2738 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2739 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2740 } else {
2741 error_report("Unknown combination of migration flags: %#x",
2742 flags);
2743 ret = -EINVAL;
2744 }
2745 }
2746 if (!ret) {
2747 ret = qemu_file_get_error(f);
2748 }
2749 }
2750
5533b2e9 2751 wait_for_decompress_done();
56e93d26 2752 rcu_read_unlock();
55c4446b 2753 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2754 return ret;
2755}
2756
2757static SaveVMHandlers savevm_ram_handlers = {
2758 .save_live_setup = ram_save_setup,
2759 .save_live_iterate = ram_save_iterate,
763c906b 2760 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2761 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2762 .save_live_pending = ram_save_pending,
2763 .load_state = ram_load,
6ad2a215 2764 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2765};
2766
2767void ram_mig_init(void)
2768{
2769 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2770 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2771}