ram: Move xbzrle_cache_miss_rate into RAMState
[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
60
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* struct contains XBZRLE cache and a static page
69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
102 *
103 * Returns the new_size or negative in case of error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
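The rounding documented above comes from pow2floor(); a minimal caller sketch (the function below is illustrative only and not part of this file):

/* Illustrative sketch: how a caller might use xbzrle_cache_resize() and
 * observe the power-of-two rounding. */
static void example_resize_xbzrle_cache(int64_t requested)
{
    int64_t actual = xbzrle_cache_resize(requested);

    if (actual < 0) {
        /* requested < TARGET_PAGE_SIZE, or allocating the new cache failed */
        return;
    }
    /* e.g. requested = 5 * 1024 * 1024 comes back as 4 * 1024 * 1024,
     * because the returned value is pow2floor(new_size) */
}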
140
6f37bb8b
JQ
141/* State of RAM for migration */
142struct RAMState {
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
152 bool ram_bulk_stage;
8d820d6f
JQ
153 /* How many times we have detected that too many pages are being dirtied */
154 int dirty_rate_high_cnt;
5a987738
JQ
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
f664da80
JQ
157 /* these variables are used for bitmap sync */
158 /* last time we did a full bitmap_sync */
159 int64_t time_last_bitmap_sync;
eac74159 160 /* bytes transferred at start_time */
c4bdf0cf 161 uint64_t bytes_xfer_prev;
a66cd90c 162 /* number of dirty pages since start_time */
68908ed6 163 uint64_t num_dirty_pages_period;
b5833fde
JQ
164 /* xbzrle misses since the beginning of the period */
165 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
166 /* number of iterations at the beginning of period */
167 uint64_t iterations_prev;
f7ccd61b
JQ
168 /* Accounting fields */
169 /* number of zero pages. Historically this counted pages filled with the same byte. */
170 uint64_t zero_pages;
b4d1c6e7
JQ
171 /* number of normal transferred pages */
172 uint64_t norm_pages;
23b28c3c
JQ
173 /* Iterations since start */
174 uint64_t iterations;
f36ada95
JQ
175 /* xbzrle transmitted bytes. Note these are post-compression,
176 * so they cannot be derived from the page counts */
07ed50a2 177 uint64_t xbzrle_bytes;
f36ada95
JQ
178 /* xbzrle transmitted pages */
179 uint64_t xbzrle_pages;
544c36f1
JQ
180 /* xbzrle number of cache miss */
181 uint64_t xbzrle_cache_miss;
b07016b6
JQ
182 /* xbzrle miss rate */
183 double xbzrle_cache_miss_rate;
6f37bb8b
JQ
184};
185typedef struct RAMState RAMState;
186
187static RAMState ram_state;
188
56e93d26
JQ
189/* accounting for migration statistics */
190typedef struct AccountingInfo {
56e93d26
JQ
191 uint64_t xbzrle_overflows;
192} AccountingInfo;
193
194static AccountingInfo acct_info;
195
196static void acct_clear(void)
197{
198 memset(&acct_info, 0, sizeof(acct_info));
199}
200
56e93d26
JQ
201uint64_t dup_mig_pages_transferred(void)
202{
f7ccd61b 203 return ram_state.zero_pages;
56e93d26
JQ
204}
205
56e93d26
JQ
206uint64_t norm_mig_pages_transferred(void)
207{
b4d1c6e7 208 return ram_state.norm_pages;
56e93d26
JQ
209}
210
211uint64_t xbzrle_mig_bytes_transferred(void)
212{
07ed50a2 213 return ram_state.xbzrle_bytes;
56e93d26
JQ
214}
215
216uint64_t xbzrle_mig_pages_transferred(void)
217{
f36ada95 218 return ram_state.xbzrle_pages;
56e93d26
JQ
219}
220
221uint64_t xbzrle_mig_pages_cache_miss(void)
222{
544c36f1 223 return ram_state.xbzrle_cache_miss;
56e93d26
JQ
224}
225
226double xbzrle_mig_cache_miss_rate(void)
227{
b07016b6 228 return ram_state.xbzrle_cache_miss_rate;
56e93d26
JQ
229}
230
231uint64_t xbzrle_mig_pages_overflow(void)
232{
233 return acct_info.xbzrle_overflows;
234}
235
dd631697 236static QemuMutex migration_bitmap_mutex;
56e93d26 237static uint64_t migration_dirty_pages;
56e93d26 238
b8fb8cb7
DDAG
239/* used by the search for pages to send */
240struct PageSearchStatus {
241 /* Current block being searched */
242 RAMBlock *block;
243 /* Current offset to search from */
244 ram_addr_t offset;
245 /* Set once we wrap around */
246 bool complete_round;
247};
248typedef struct PageSearchStatus PageSearchStatus;
249
60be6340
DL
250static struct BitmapRcu {
251 struct rcu_head rcu;
f3f491fc 252 /* Main migration bitmap */
60be6340 253 unsigned long *bmap;
f3f491fc
DDAG
254 /* bitmap of pages that haven't been sent even once
255 * only maintained and used in postcopy at the moment
256 * where it's used to send the dirtymap at the start
257 * of the postcopy phase
258 */
259 unsigned long *unsentmap;
60be6340
DL
260} *migration_bitmap_rcu;
261
56e93d26 262struct CompressParam {
56e93d26 263 bool done;
90e56fb4 264 bool quit;
56e93d26
JQ
265 QEMUFile *file;
266 QemuMutex mutex;
267 QemuCond cond;
268 RAMBlock *block;
269 ram_addr_t offset;
270};
271typedef struct CompressParam CompressParam;
272
273struct DecompressParam {
73a8912b 274 bool done;
90e56fb4 275 bool quit;
56e93d26
JQ
276 QemuMutex mutex;
277 QemuCond cond;
278 void *des;
d341d9f3 279 uint8_t *compbuf;
56e93d26
JQ
280 int len;
281};
282typedef struct DecompressParam DecompressParam;
283
284static CompressParam *comp_param;
285static QemuThread *compress_threads;
286/* comp_done_cond is used to wake up the migration thread when
287 * one of the compression threads has finished the compression.
288 * comp_done_lock is used to co-work with comp_done_cond.
289 */
0d9f9a5c
LL
290static QemuMutex comp_done_lock;
291static QemuCond comp_done_cond;
56e93d26
JQ
292/* The empty QEMUFileOps will be used by file in CompressParam */
293static const QEMUFileOps empty_ops = { };
294
295static bool compression_switch;
56e93d26
JQ
296static DecompressParam *decomp_param;
297static QemuThread *decompress_threads;
73a8912b
LL
298static QemuMutex decomp_done_lock;
299static QemuCond decomp_done_cond;
56e93d26 300
a7a9a88f
LL
301static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
302 ram_addr_t offset);
56e93d26
JQ
303
304static void *do_data_compress(void *opaque)
305{
306 CompressParam *param = opaque;
a7a9a88f
LL
307 RAMBlock *block;
308 ram_addr_t offset;
56e93d26 309
a7a9a88f 310 qemu_mutex_lock(&param->mutex);
90e56fb4 311 while (!param->quit) {
a7a9a88f
LL
312 if (param->block) {
313 block = param->block;
314 offset = param->offset;
315 param->block = NULL;
316 qemu_mutex_unlock(&param->mutex);
317
318 do_compress_ram_page(param->file, block, offset);
319
0d9f9a5c 320 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 321 param->done = true;
0d9f9a5c
LL
322 qemu_cond_signal(&comp_done_cond);
323 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
324
325 qemu_mutex_lock(&param->mutex);
326 } else {
56e93d26
JQ
327 qemu_cond_wait(&param->cond, &param->mutex);
328 }
56e93d26 329 }
a7a9a88f 330 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
331
332 return NULL;
333}
334
335static inline void terminate_compression_threads(void)
336{
337 int idx, thread_count;
338
339 thread_count = migrate_compress_threads();
3d0684b2 340
56e93d26
JQ
341 for (idx = 0; idx < thread_count; idx++) {
342 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 343 comp_param[idx].quit = true;
56e93d26
JQ
344 qemu_cond_signal(&comp_param[idx].cond);
345 qemu_mutex_unlock(&comp_param[idx].mutex);
346 }
347}
348
349void migrate_compress_threads_join(void)
350{
351 int i, thread_count;
352
353 if (!migrate_use_compression()) {
354 return;
355 }
356 terminate_compression_threads();
357 thread_count = migrate_compress_threads();
358 for (i = 0; i < thread_count; i++) {
359 qemu_thread_join(compress_threads + i);
360 qemu_fclose(comp_param[i].file);
361 qemu_mutex_destroy(&comp_param[i].mutex);
362 qemu_cond_destroy(&comp_param[i].cond);
363 }
0d9f9a5c
LL
364 qemu_mutex_destroy(&comp_done_lock);
365 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
366 g_free(compress_threads);
367 g_free(comp_param);
56e93d26
JQ
368 compress_threads = NULL;
369 comp_param = NULL;
56e93d26
JQ
370}
371
372void migrate_compress_threads_create(void)
373{
374 int i, thread_count;
375
376 if (!migrate_use_compression()) {
377 return;
378 }
56e93d26
JQ
379 compression_switch = true;
380 thread_count = migrate_compress_threads();
381 compress_threads = g_new0(QemuThread, thread_count);
382 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
383 qemu_cond_init(&comp_done_cond);
384 qemu_mutex_init(&comp_done_lock);
56e93d26 385 for (i = 0; i < thread_count; i++) {
e110aa91
C
386 /* comp_param[i].file is just used as a dummy buffer to save data,
387 * set its ops to empty.
56e93d26
JQ
388 */
389 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
390 comp_param[i].done = true;
90e56fb4 391 comp_param[i].quit = false;
56e93d26
JQ
392 qemu_mutex_init(&comp_param[i].mutex);
393 qemu_cond_init(&comp_param[i].cond);
394 qemu_thread_create(compress_threads + i, "compress",
395 do_data_compress, comp_param + i,
396 QEMU_THREAD_JOINABLE);
397 }
398}
399
400/**
3d0684b2 401 * save_page_header: write page header to wire
56e93d26
JQ
402 *
403 * If this is the 1st block, it also writes the block identification
404 *
3d0684b2 405 * Returns the number of bytes written
56e93d26
JQ
406 *
407 * @f: QEMUFile where to send the data
408 * @block: block that contains the page we want to send
409 * @offset: offset inside the block for the page
410 * in the lower bits, it contains flags
411 */
412static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
413{
9f5f380b 414 size_t size, len;
56e93d26
JQ
415
416 qemu_put_be64(f, offset);
417 size = 8;
418
419 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
420 len = strlen(block->idstr);
421 qemu_put_byte(f, len);
422 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
423 size += 1 + len;
56e93d26
JQ
424 }
425 return size;
426}
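As a cross-check on the header layout described above, here is a hedged sketch of how a receiver could decode it; this is not the actual load path, and example_read_page_header() is an invented name:

/* Illustrative decoder for the header written by save_page_header(). */
static void example_read_page_header(QEMUFile *f)
{
    uint64_t addr = qemu_get_be64(f);              /* offset | flags: 8 bytes */

    if (!(addr & RAM_SAVE_FLAG_CONTINUE)) {
        char idstr[256];
        int len = qemu_get_byte(f);                /* 1 byte: block id length */

        qemu_get_buffer(f, (uint8_t *)idstr, len); /* 'len' bytes: block id */
        idstr[len] = '\0';
    }
}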
427
3d0684b2
JQ
428/**
429 * mig_throttle_guest_down: throttle down the guest
430 *
431 * Reduce amount of guest cpu execution to hopefully slow down memory
432 * writes. If guest dirty memory rate is reduced below the rate at
433 * which we can transfer pages to the destination then we should be
434 * able to complete migration. Some workloads dirty memory way too
435 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
436 */
437static void mig_throttle_guest_down(void)
438{
439 MigrationState *s = migrate_get_current();
2594f56d
DB
440 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
441 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
442
443 /* We have not started throttling yet. Let's start it. */
444 if (!cpu_throttle_active()) {
445 cpu_throttle_set(pct_initial);
446 } else {
447 /* Throttling already on, just increase the rate */
448 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
449 }
450}
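For intuition, a hedged numeric example of how the throttle ramps up; the parameter values below are assumptions for illustration, not asserted defaults:

/* Illustrative only: with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, successive calls give
 *   1st call: cpu_throttle_set(20)       -> guest throttled at 20%
 *   2nd call: cpu_throttle_set(20 + 10)  -> 30%
 *   3rd call: cpu_throttle_set(30 + 10)  -> 40%
 * and so on, until the dirty rate drops below the transfer rate. */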
451
3d0684b2
JQ
452/**
453 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
454 *
6f37bb8b 455 * @rs: current RAM state
3d0684b2
JQ
456 * @current_addr: address for the zero page
457 *
458 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
459 * The important thing is that a stale (not-yet-0'd) page be replaced
460 * by the new data.
461 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 462 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 463 */
6f37bb8b 464static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 465{
6f37bb8b 466 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
467 return;
468 }
469
470 /* We don't care if this fails to allocate a new cache page
471 * as long as it updated an old one */
472 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 473 rs->bitmap_sync_count);
56e93d26
JQ
474}
475
476#define ENCODING_FLAG_XBZRLE 0x1
477
478/**
479 * save_xbzrle_page: compress and send current page
480 *
481 * Returns: 1 means that we wrote the page
482 * 0 means the page is identical to the one already sent
483 * -1 means that xbzrle would be longer than normal
484 *
5a987738 485 * @rs: current RAM state
56e93d26 486 * @f: QEMUFile where to send the data
3d0684b2
JQ
487 * @current_data: pointer to the address of the page contents
488 * @current_addr: addr of the page
56e93d26
JQ
489 * @block: block that contains the page we want to send
490 * @offset: offset inside the block for the page
491 * @last_stage: if we are at the completion stage
492 * @bytes_transferred: increase it with the number of transferred bytes
493 */
5a987738 494static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
56e93d26
JQ
495 ram_addr_t current_addr, RAMBlock *block,
496 ram_addr_t offset, bool last_stage,
497 uint64_t *bytes_transferred)
498{
499 int encoded_len = 0, bytes_xbzrle;
500 uint8_t *prev_cached_page;
501
5a987738 502 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
544c36f1 503 rs->xbzrle_cache_miss++;
56e93d26
JQ
504 if (!last_stage) {
505 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 506 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
507 return -1;
508 } else {
509 /* update *current_data when the page has been
510 inserted into cache */
511 *current_data = get_cached_data(XBZRLE.cache, current_addr);
512 }
513 }
514 return -1;
515 }
516
517 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
518
519 /* save current buffer into memory */
520 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
521
522 /* XBZRLE encoding (if there is no overflow) */
523 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
524 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
525 TARGET_PAGE_SIZE);
526 if (encoded_len == 0) {
55c4446b 527 trace_save_xbzrle_page_skipping();
56e93d26
JQ
528 return 0;
529 } else if (encoded_len == -1) {
55c4446b 530 trace_save_xbzrle_page_overflow();
56e93d26
JQ
531 acct_info.xbzrle_overflows++;
532 /* update data in the cache */
533 if (!last_stage) {
534 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
535 *current_data = prev_cached_page;
536 }
537 return -1;
538 }
539
540 /* we need to update the data in the cache, in order to get the same data */
541 if (!last_stage) {
542 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
543 }
544
545 /* Send XBZRLE based compressed page */
546 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
547 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
548 qemu_put_be16(f, encoded_len);
549 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
550 bytes_xbzrle += encoded_len + 1 + 2;
f36ada95 551 rs->xbzrle_pages++;
07ed50a2 552 rs->xbzrle_bytes += bytes_xbzrle;
56e93d26
JQ
553 *bytes_transferred += bytes_xbzrle;
554
555 return 1;
556}
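To make the bytes_xbzrle accounting concrete, a worked tally of what one XBZRLE page costs on the wire (the header grows by the block id the first time a block is seen):

/* Illustrative cost of one XBZRLE page:
 *   save_page_header()              8 bytes (+ 1 + strlen(idstr) on a new block)
 *   ENCODING_FLAG_XBZRLE            1 byte
 *   encoded_len as big-endian 16    2 bytes
 *   encoded payload                 encoded_len bytes
 * which is exactly the "encoded_len + 1 + 2" added to bytes_xbzrle above. */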
557
3d0684b2
JQ
558/**
559 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 560 *
3d0684b2
JQ
561 * Called with rcu_read_lock() to protect migration_bitmap
562 *
563 * Returns the byte offset within memory region of the start of a dirty page
564 *
6f37bb8b 565 * @rs: current RAM state
3d0684b2
JQ
566 * @rb: RAMBlock where to search for dirty pages
567 * @start: starting address (typically so we can continue from previous page)
568 * @ram_addr_abs: pointer into which to store the address of the dirty page
569 * within the global ram_addr space
f3f491fc 570 */
56e93d26 571static inline
6f37bb8b 572ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
573 ram_addr_t start,
574 ram_addr_t *ram_addr_abs)
56e93d26 575{
2f68e399 576 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 577 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
578 uint64_t rb_size = rb->used_length;
579 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 580 unsigned long *bitmap;
56e93d26
JQ
581
582 unsigned long next;
583
60be6340 584 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
6f37bb8b 585 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
586 next = nr + 1;
587 } else {
2ff64038 588 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
589 }
590
f3f491fc 591 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
592 return (next - base) << TARGET_PAGE_BITS;
593}
594
a82d593b
DDAG
595static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
596{
597 bool ret;
598 int nr = addr >> TARGET_PAGE_BITS;
599 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
600
601 ret = test_and_clear_bit(nr, bitmap);
602
603 if (ret) {
604 migration_dirty_pages--;
605 }
606 return ret;
607}
608
a66cd90c
JQ
609static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
610 ram_addr_t length)
56e93d26 611{
2ff64038 612 unsigned long *bitmap;
60be6340 613 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1ffb5dfd 614 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
a66cd90c 615 start, length, &rs->num_dirty_pages_period);
56e93d26
JQ
616}
617
f664da80 618static void migration_bitmap_sync_init(RAMState *rs)
56e93d26 619{
f664da80 620 rs->time_last_bitmap_sync = 0;
eac74159 621 rs->bytes_xfer_prev = 0;
a66cd90c 622 rs->num_dirty_pages_period = 0;
b5833fde 623 rs->xbzrle_cache_miss_prev = 0;
36040d9c 624 rs->iterations_prev = 0;
56e93d26
JQ
625}
626
3d0684b2
JQ
627/**
628 * ram_pagesize_summary: calculate all the pagesizes of a VM
629 *
630 * Returns a summary bitmap of the page sizes of all RAMBlocks
631 *
632 * For VMs with just normal pages this is equivalent to the host page
633 * size. If it's got some huge pages then it's the OR of all the
634 * different page sizes.
e8ca1db2
DDAG
635 */
636uint64_t ram_pagesize_summary(void)
637{
638 RAMBlock *block;
639 uint64_t summary = 0;
640
641 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
642 summary |= block->page_size;
643 }
644
645 return summary;
646}
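Page sizes are powers of two, so OR-ing them keeps every size recoverable from the summary; a small worked example with assumed backends:

/* Illustrative only: a guest with ordinary 4 KiB RAM plus a RAMBlock
 * backed by 2 MiB hugepages reports
 *   summary = 0x1000 | 0x200000 = 0x201000
 * so both page sizes can still be read back out of the single value. */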
647
8d820d6f 648static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
649{
650 RAMBlock *block;
56e93d26
JQ
651 MigrationState *s = migrate_get_current();
652 int64_t end_time;
c4bdf0cf 653 uint64_t bytes_xfer_now;
56e93d26 654
5a987738 655 rs->bitmap_sync_count++;
56e93d26 656
eac74159
JQ
657 if (!rs->bytes_xfer_prev) {
658 rs->bytes_xfer_prev = ram_bytes_transferred();
56e93d26
JQ
659 }
660
f664da80
JQ
661 if (!rs->time_last_bitmap_sync) {
662 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
663 }
664
665 trace_migration_bitmap_sync_start();
9c1f8f44 666 memory_global_dirty_log_sync();
56e93d26 667
dd631697 668 qemu_mutex_lock(&migration_bitmap_mutex);
56e93d26
JQ
669 rcu_read_lock();
670 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
a66cd90c 671 migration_bitmap_sync_range(rs, block->offset, block->used_length);
56e93d26
JQ
672 }
673 rcu_read_unlock();
dd631697 674 qemu_mutex_unlock(&migration_bitmap_mutex);
56e93d26 675
a66cd90c 676 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 677
56e93d26
JQ
678 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
679
680 /* more than 1 second = 1000 milliseconds */
f664da80 681 if (end_time > rs->time_last_bitmap_sync + 1000) {
56e93d26
JQ
682 if (migrate_auto_converge()) {
683 /* The following detection logic can be refined later. For now:
684 Check to see if the dirtied bytes are 50% more than the approx.
685 amount of bytes that just got transferred since the last time we
070afca2
JH
686 were in this routine. If that happens twice, start or increase
687 throttling */
56e93d26 688 bytes_xfer_now = ram_bytes_transferred();
070afca2 689
56e93d26 690 if (s->dirty_pages_rate &&
a66cd90c 691 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 692 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
8d820d6f 693 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 694 trace_migration_throttle();
8d820d6f 695 rs->dirty_rate_high_cnt = 0;
070afca2 696 mig_throttle_guest_down();
56e93d26 697 }
eac74159 698 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 699 }
070afca2 700
56e93d26 701 if (migrate_use_xbzrle()) {
23b28c3c 702 if (rs->iterations_prev != rs->iterations) {
b07016b6 703 rs->xbzrle_cache_miss_rate =
544c36f1 704 (double)(rs->xbzrle_cache_miss -
b5833fde 705 rs->xbzrle_cache_miss_prev) /
23b28c3c 706 (rs->iterations - rs->iterations_prev);
56e93d26 707 }
23b28c3c 708 rs->iterations_prev = rs->iterations;
544c36f1 709 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
56e93d26 710 }
a66cd90c 711 s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
f664da80 712 / (end_time - rs->time_last_bitmap_sync);
56e93d26 713 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
f664da80 714 rs->time_last_bitmap_sync = end_time;
a66cd90c 715 rs->num_dirty_pages_period = 0;
56e93d26 716 }
5a987738 717 s->dirty_sync_count = rs->bitmap_sync_count;
4addcd4f 718 if (migrate_use_events()) {
5a987738 719 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 720 }
56e93d26
JQ
721}
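The xbzrle_cache_miss_rate updated above is a per-period ratio; a hedged numeric example of the calculation:

/* Illustrative numbers: if this sync period covered 1000 iterations
 * (rs->iterations - rs->iterations_prev == 1000) and the miss counter
 * grew by 250 (rs->xbzrle_cache_miss - rs->xbzrle_cache_miss_prev), then
 *   rs->xbzrle_cache_miss_rate = 250.0 / 1000 = 0.25
 * i.e. 25% of the pages looked up in this period missed the XBZRLE cache. */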
722
723/**
3d0684b2 724 * save_zero_page: send the zero page to the stream
56e93d26 725 *
3d0684b2 726 * Returns the number of pages written.
56e93d26 727 *
f7ccd61b 728 * @rs: current RAM state
56e93d26
JQ
729 * @f: QEMUFile where to send the data
730 * @block: block that contains the page we want to send
731 * @offset: offset inside the block for the page
732 * @p: pointer to the page
733 * @bytes_transferred: increase it with the number of transferred bytes
734 */
f7ccd61b
JQ
735static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
736 ram_addr_t offset,
56e93d26
JQ
737 uint8_t *p, uint64_t *bytes_transferred)
738{
739 int pages = -1;
740
741 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
f7ccd61b 742 rs->zero_pages++;
56e93d26
JQ
743 *bytes_transferred += save_page_header(f, block,
744 offset | RAM_SAVE_FLAG_COMPRESS);
745 qemu_put_byte(f, 0);
746 *bytes_transferred += 1;
747 pages = 1;
748 }
749
750 return pages;
751}
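For scale, a hedged estimate of what a zero page costs on the wire (the header again grows by the block id the first time a block is seen):

/* Illustrative cost of one zero page:
 *   save_page_header() with RAM_SAVE_FLAG_COMPRESS   8 bytes (+ block id once)
 *   the fill byte (always 0 here)                    1 byte
 * so roughly 9 bytes instead of a full TARGET_PAGE_SIZE payload. */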
752
36449157 753static void ram_release_pages(MigrationState *ms, const char *rbname,
53f09a10
PB
754 uint64_t offset, int pages)
755{
756 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
757 return;
758 }
759
36449157 760 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
761}
762
56e93d26 763/**
3d0684b2 764 * ram_save_page: send the given page to the stream
56e93d26 765 *
3d0684b2 766 * Returns the number of pages written.
3fd3c4b3
DDAG
767 * < 0 - error
768 * >=0 - Number of pages written - this might legally be 0
769 * if xbzrle noticed the page was the same.
56e93d26 770 *
6f37bb8b 771 * @rs: current RAM state
3d0684b2 772 * @ms: current migration state
56e93d26
JQ
773 * @f: QEMUFile where to send the data
774 * @pss: data about the state of the current dirty page scan
775 * (the block and offset of the page we want to send)
776 * @last_stage: if we are at the completion stage
777 * @bytes_transferred: increase it with the number of transferred bytes
778 */
6f37bb8b
JQ
779static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
780 PageSearchStatus *pss, bool last_stage,
781 uint64_t *bytes_transferred)
56e93d26
JQ
782{
783 int pages = -1;
784 uint64_t bytes_xmit;
785 ram_addr_t current_addr;
56e93d26
JQ
786 uint8_t *p;
787 int ret;
788 bool send_async = true;
a08f6890
HZ
789 RAMBlock *block = pss->block;
790 ram_addr_t offset = pss->offset;
56e93d26 791
2f68e399 792 p = block->host + offset;
56e93d26
JQ
793
794 /* When in doubt, send the page as a normal page */
795 bytes_xmit = 0;
796 ret = ram_control_save_page(f, block->offset,
797 offset, TARGET_PAGE_SIZE, &bytes_xmit);
798 if (bytes_xmit) {
799 *bytes_transferred += bytes_xmit;
800 pages = 1;
801 }
802
803 XBZRLE_cache_lock();
804
805 current_addr = block->offset + offset;
806
6f37bb8b 807 if (block == rs->last_sent_block) {
56e93d26
JQ
808 offset |= RAM_SAVE_FLAG_CONTINUE;
809 }
810 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
811 if (ret != RAM_SAVE_CONTROL_DELAYED) {
812 if (bytes_xmit > 0) {
b4d1c6e7 813 rs->norm_pages++;
56e93d26 814 } else if (bytes_xmit == 0) {
f7ccd61b 815 rs->zero_pages++;
56e93d26
JQ
816 }
817 }
818 } else {
f7ccd61b 819 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
56e93d26
JQ
820 if (pages > 0) {
821 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
822 * page would be stale
823 */
6f37bb8b 824 xbzrle_cache_zero_page(rs, current_addr);
53f09a10 825 ram_release_pages(ms, block->idstr, pss->offset, pages);
6f37bb8b 826 } else if (!rs->ram_bulk_stage &&
9eb14766 827 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
5a987738 828 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
56e93d26
JQ
829 offset, last_stage, bytes_transferred);
830 if (!last_stage) {
831 /* Can't send this cached data async, since the cache page
832 * might get updated before it gets to the wire
833 */
834 send_async = false;
835 }
836 }
837 }
838
839 /* XBZRLE overflow or normal page */
840 if (pages == -1) {
841 *bytes_transferred += save_page_header(f, block,
842 offset | RAM_SAVE_FLAG_PAGE);
843 if (send_async) {
53f09a10
PB
844 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
845 migrate_release_ram() &
846 migration_in_postcopy(ms));
56e93d26
JQ
847 } else {
848 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
849 }
850 *bytes_transferred += TARGET_PAGE_SIZE;
851 pages = 1;
b4d1c6e7 852 rs->norm_pages++;
56e93d26
JQ
853 }
854
855 XBZRLE_cache_unlock();
856
857 return pages;
858}
859
a7a9a88f
LL
860static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
861 ram_addr_t offset)
56e93d26
JQ
862{
863 int bytes_sent, blen;
a7a9a88f 864 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 865
a7a9a88f 866 bytes_sent = save_page_header(f, block, offset |
56e93d26 867 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 868 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 869 migrate_compress_level());
b3be2896
LL
870 if (blen < 0) {
871 bytes_sent = 0;
872 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
873 error_report("compressed data failed!");
874 } else {
875 bytes_sent += blen;
53f09a10
PB
876 ram_release_pages(migrate_get_current(), block->idstr,
877 offset & TARGET_PAGE_MASK, 1);
b3be2896 878 }
56e93d26
JQ
879
880 return bytes_sent;
881}
882
56e93d26
JQ
883static uint64_t bytes_transferred;
884
885static void flush_compressed_data(QEMUFile *f)
886{
887 int idx, len, thread_count;
888
889 if (!migrate_use_compression()) {
890 return;
891 }
892 thread_count = migrate_compress_threads();
a7a9a88f 893
0d9f9a5c 894 qemu_mutex_lock(&comp_done_lock);
56e93d26 895 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 896 while (!comp_param[idx].done) {
0d9f9a5c 897 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 898 }
a7a9a88f 899 }
0d9f9a5c 900 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
901
902 for (idx = 0; idx < thread_count; idx++) {
903 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 904 if (!comp_param[idx].quit) {
56e93d26
JQ
905 len = qemu_put_qemu_file(f, comp_param[idx].file);
906 bytes_transferred += len;
907 }
a7a9a88f 908 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
909 }
910}
911
912static inline void set_compress_params(CompressParam *param, RAMBlock *block,
913 ram_addr_t offset)
914{
915 param->block = block;
916 param->offset = offset;
917}
918
b4d1c6e7
JQ
919static int compress_page_with_multi_thread(RAMState *rs, QEMUFile *f,
920 RAMBlock *block, ram_addr_t offset,
56e93d26
JQ
921 uint64_t *bytes_transferred)
922{
923 int idx, thread_count, bytes_xmit = -1, pages = -1;
924
925 thread_count = migrate_compress_threads();
0d9f9a5c 926 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
927 while (true) {
928 for (idx = 0; idx < thread_count; idx++) {
929 if (comp_param[idx].done) {
a7a9a88f 930 comp_param[idx].done = false;
56e93d26 931 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 932 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 933 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
934 qemu_cond_signal(&comp_param[idx].cond);
935 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 936 pages = 1;
b4d1c6e7 937 rs->norm_pages++;
56e93d26
JQ
938 *bytes_transferred += bytes_xmit;
939 break;
940 }
941 }
942 if (pages > 0) {
943 break;
944 } else {
0d9f9a5c 945 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
946 }
947 }
0d9f9a5c 948 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
949
950 return pages;
951}
952
953/**
954 * ram_save_compressed_page: compress the given page and send it to the stream
955 *
3d0684b2 956 * Returns the number of pages written.
56e93d26 957 *
6f37bb8b 958 * @rs: current RAM state
3d0684b2 959 * @ms: current migration state
56e93d26
JQ
960 * @f: QEMUFile where to send the data
961 * @pss: data about the state of the current dirty page scan
962 * (the block and offset of the page we want to send)
963 * @last_stage: if we are at the completion stage
964 * @bytes_transferred: increase it with the number of transferred bytes
965 */
6f37bb8b
JQ
966static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
967 QEMUFile *f,
9eb14766 968 PageSearchStatus *pss, bool last_stage,
56e93d26
JQ
969 uint64_t *bytes_transferred)
970{
971 int pages = -1;
fc50438e 972 uint64_t bytes_xmit = 0;
56e93d26 973 uint8_t *p;
fc50438e 974 int ret, blen;
a08f6890
HZ
975 RAMBlock *block = pss->block;
976 ram_addr_t offset = pss->offset;
56e93d26 977
2f68e399 978 p = block->host + offset;
56e93d26 979
56e93d26
JQ
980 ret = ram_control_save_page(f, block->offset,
981 offset, TARGET_PAGE_SIZE, &bytes_xmit);
982 if (bytes_xmit) {
983 *bytes_transferred += bytes_xmit;
984 pages = 1;
985 }
56e93d26
JQ
986 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
987 if (ret != RAM_SAVE_CONTROL_DELAYED) {
988 if (bytes_xmit > 0) {
b4d1c6e7 989 rs->norm_pages++;
56e93d26 990 } else if (bytes_xmit == 0) {
f7ccd61b 991 rs->zero_pages++;
56e93d26
JQ
992 }
993 }
994 } else {
995 /* When starting a new block, the first page of the block must be
996 * sent before any other page in the same block, and all pages of
997 * the previous block must already have been sent. Keeping this
998 * order is important, because the 'cont' flag is used to avoid
999 * resending the block name.
1000 */
6f37bb8b 1001 if (block != rs->last_sent_block) {
56e93d26 1002 flush_compressed_data(f);
f7ccd61b 1003 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
56e93d26 1004 if (pages == -1) {
fc50438e
LL
1005 /* Make sure the first page is sent out before other pages */
1006 bytes_xmit = save_page_header(f, block, offset |
1007 RAM_SAVE_FLAG_COMPRESS_PAGE);
1008 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1009 migrate_compress_level());
1010 if (blen > 0) {
1011 *bytes_transferred += bytes_xmit + blen;
b4d1c6e7 1012 rs->norm_pages++;
b3be2896 1013 pages = 1;
fc50438e
LL
1014 } else {
1015 qemu_file_set_error(f, blen);
1016 error_report("compressed data failed!");
b3be2896 1017 }
56e93d26 1018 }
53f09a10
PB
1019 if (pages > 0) {
1020 ram_release_pages(ms, block->idstr, pss->offset, pages);
1021 }
56e93d26 1022 } else {
fc50438e 1023 offset |= RAM_SAVE_FLAG_CONTINUE;
f7ccd61b 1024 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
56e93d26 1025 if (pages == -1) {
b4d1c6e7 1026 pages = compress_page_with_multi_thread(rs, f, block, offset,
56e93d26 1027 bytes_transferred);
53f09a10
PB
1028 } else {
1029 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1030 }
1031 }
1032 }
1033
1034 return pages;
1035}
1036
3d0684b2
JQ
1037/**
1038 * find_dirty_block: find the next dirty page and update any state
1039 * associated with the search process.
b9e60928 1040 *
3d0684b2 1041 * Returns true if a page was found
b9e60928 1042 *
6f37bb8b 1043 * @rs: current RAM state
3d0684b2
JQ
1044 * @f: QEMUFile where to send the data
1045 * @pss: data about the state of the current dirty page scan
1046 * @again: set to false if the search has scanned the whole of RAM
1047 * @ram_addr_abs: pointer into which to store the address of the dirty page
1048 * within the global ram_addr space
b9e60928 1049 */
6f37bb8b 1050static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1051 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1052{
6f37bb8b 1053 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1054 ram_addr_abs);
6f37bb8b
JQ
1055 if (pss->complete_round && pss->block == rs->last_seen_block &&
1056 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1057 /*
1058 * We've been once around the RAM and haven't found anything.
1059 * Give up.
1060 */
1061 *again = false;
1062 return false;
1063 }
1064 if (pss->offset >= pss->block->used_length) {
1065 /* Didn't find anything in this RAM Block */
1066 pss->offset = 0;
1067 pss->block = QLIST_NEXT_RCU(pss->block, next);
1068 if (!pss->block) {
1069 /* Hit the end of the list */
1070 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1071 /* Flag that we've looped */
1072 pss->complete_round = true;
6f37bb8b 1073 rs->ram_bulk_stage = false;
b9e60928
DDAG
1074 if (migrate_use_xbzrle()) {
1075 /* If xbzrle is on, stop using the data compression at this
1076 * point. In theory, xbzrle can do better than compression.
1077 */
1078 flush_compressed_data(f);
1079 compression_switch = false;
1080 }
1081 }
1082 /* Didn't find anything this time, but try again on the new block */
1083 *again = true;
1084 return false;
1085 } else {
1086 /* Can go around again, but... */
1087 *again = true;
1088 /* We've found something so probably don't need to */
1089 return true;
1090 }
1091}
1092
3d0684b2
JQ
1093/**
1094 * unqueue_page: gets a page of the queue
1095 *
a82d593b 1096 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1097 *
3d0684b2
JQ
1098 * Returns the block of the page (or NULL if none available)
1099 *
1100 * @ms: current migration state
1101 * @offset: used to return the offset within the RAMBlock
1102 * @ram_addr_abs: pointer into which to store the address of the dirty page
1103 * within the global ram_addr space
a82d593b
DDAG
1104 */
1105static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1106 ram_addr_t *ram_addr_abs)
1107{
1108 RAMBlock *block = NULL;
1109
1110 qemu_mutex_lock(&ms->src_page_req_mutex);
1111 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1112 struct MigrationSrcPageRequest *entry =
1113 QSIMPLEQ_FIRST(&ms->src_page_requests);
1114 block = entry->rb;
1115 *offset = entry->offset;
1116 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1117 TARGET_PAGE_MASK;
1118
1119 if (entry->len > TARGET_PAGE_SIZE) {
1120 entry->len -= TARGET_PAGE_SIZE;
1121 entry->offset += TARGET_PAGE_SIZE;
1122 } else {
1123 memory_region_unref(block->mr);
1124 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1125 g_free(entry);
1126 }
1127 }
1128 qemu_mutex_unlock(&ms->src_page_req_mutex);
1129
1130 return block;
1131}
1132
3d0684b2
JQ
1133/**
1134 * get_queued_page: unqueue a page from the postcopy requests
1135 *
1136 * Skips pages that are already sent (!dirty)
a82d593b 1137 *
3d0684b2 1138 * Returns true if a queued page was found
a82d593b 1139 *
6f37bb8b 1140 * @rs: current RAM state
3d0684b2
JQ
1141 * @ms: current migration state
1142 * @pss: data about the state of the current dirty page scan
1143 * @ram_addr_abs: pointer into which to store the address of the dirty page
1144 * within the global ram_addr space
a82d593b 1145 */
6f37bb8b
JQ
1146static bool get_queued_page(RAMState *rs, MigrationState *ms,
1147 PageSearchStatus *pss,
a82d593b
DDAG
1148 ram_addr_t *ram_addr_abs)
1149{
1150 RAMBlock *block;
1151 ram_addr_t offset;
1152 bool dirty;
1153
1154 do {
1155 block = unqueue_page(ms, &offset, ram_addr_abs);
1156 /*
1157 * We're sending this page, and since it's postcopy nothing else
1158 * will dirty it, and we must make sure it doesn't get sent again
1159 * even if this queue request was received after the background
1160 * search already sent it.
1161 */
1162 if (block) {
1163 unsigned long *bitmap;
1164 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1165 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1166 if (!dirty) {
1167 trace_get_queued_page_not_dirty(
1168 block->idstr, (uint64_t)offset,
1169 (uint64_t)*ram_addr_abs,
1170 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1171 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1172 } else {
1173 trace_get_queued_page(block->idstr,
1174 (uint64_t)offset,
1175 (uint64_t)*ram_addr_abs);
1176 }
1177 }
1178
1179 } while (block && !dirty);
1180
1181 if (block) {
1182 /*
1183 * As soon as we start servicing pages out of order, we have to
1184 * kill the bulk stage, since the bulk stage assumes (in
1185 * migration_bitmap_find_and_reset_dirty) that every page is
1186 * dirty, which is no longer true.
1187 */
6f37bb8b 1188 rs->ram_bulk_stage = false;
a82d593b
DDAG
1189
1190 /*
1191 * We want the background search to continue from the queued page
1192 * since the guest is likely to want other pages near to the page
1193 * it just requested.
1194 */
1195 pss->block = block;
1196 pss->offset = offset;
1197 }
1198
1199 return !!block;
1200}
1201
6c595cde 1202/**
5e58f968
JQ
1203 * migration_page_queue_free: drop any remaining pages in the ram
1204 * request queue
6c595cde 1205 *
3d0684b2
JQ
1206 * It should be empty at the end anyway, but in error cases there may
1207 * be some left; if any pages remain, we drop them.
1208 *
1209 * @ms: current migration state
6c595cde 1210 */
5e58f968 1211void migration_page_queue_free(MigrationState *ms)
6c595cde
DDAG
1212{
1213 struct MigrationSrcPageRequest *mspr, *next_mspr;
1214 /* This queue should generally be empty - but a failed
1215 * migration might leave some entries behind.
1216 */
1217 rcu_read_lock();
1218 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1219 memory_region_unref(mspr->rb->mr);
1220 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1221 g_free(mspr);
1222 }
1223 rcu_read_unlock();
1224}
1225
1226/**
3d0684b2
JQ
1227 * ram_save_queue_pages: queue the page for transmission
1228 *
1229 * A request from postcopy destination for example.
1230 *
1231 * Returns zero on success or negative on error
1232 *
1233 * @ms: current migration state
1234 * @rbname: Name of the RAMBLock of the request. NULL means the
1235 * same that last one.
1236 * @start: starting address from the start of the RAMBlock
1237 * @len: length (in bytes) to send
6c595cde
DDAG
1238 */
1239int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1240 ram_addr_t start, ram_addr_t len)
1241{
1242 RAMBlock *ramblock;
1243
d3bf5418 1244 ms->postcopy_requests++;
6c595cde
DDAG
1245 rcu_read_lock();
1246 if (!rbname) {
1247 /* Reuse last RAMBlock */
1248 ramblock = ms->last_req_rb;
1249
1250 if (!ramblock) {
1251 /*
1252 * Shouldn't happen, we can't reuse the last RAMBlock if
1253 * it's the 1st request.
1254 */
1255 error_report("ram_save_queue_pages no previous block");
1256 goto err;
1257 }
1258 } else {
1259 ramblock = qemu_ram_block_by_name(rbname);
1260
1261 if (!ramblock) {
1262 /* We shouldn't be asked for a non-existent RAMBlock */
1263 error_report("ram_save_queue_pages no block '%s'", rbname);
1264 goto err;
1265 }
1266 ms->last_req_rb = ramblock;
1267 }
1268 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1269 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1270 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1271 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1272 __func__, start, len, ramblock->used_length);
1273 goto err;
1274 }
1275
1276 struct MigrationSrcPageRequest *new_entry =
1277 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1278 new_entry->rb = ramblock;
1279 new_entry->offset = start;
1280 new_entry->len = len;
1281
1282 memory_region_ref(ramblock->mr);
1283 qemu_mutex_lock(&ms->src_page_req_mutex);
1284 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1285 qemu_mutex_unlock(&ms->src_page_req_mutex);
1286 rcu_read_unlock();
1287
1288 return 0;
1289
1290err:
1291 rcu_read_unlock();
1292 return -1;
1293}
1294
a82d593b 1295/**
3d0684b2 1296 * ram_save_target_page: save one target page
a82d593b 1297 *
3d0684b2 1298 * Returns the number of pages written
a82d593b 1299 *
6f37bb8b 1300 * @rs: current RAM state
3d0684b2 1301 * @ms: current migration state
a82d593b 1302 * @f: QEMUFile where to send the data
3d0684b2 1303 * @pss: data about the page we want to send
a82d593b
DDAG
1304 * @last_stage: if we are at the completion stage
1305 * @bytes_transferred: increase it with the number of transferred bytes
3d0684b2 1306 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1307 */
6f37bb8b 1308static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890 1309 PageSearchStatus *pss,
a82d593b
DDAG
1310 bool last_stage,
1311 uint64_t *bytes_transferred,
1312 ram_addr_t dirty_ram_abs)
1313{
1314 int res = 0;
1315
1316 /* Check whether the page is dirty and, if so, send it */
1317 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1318 unsigned long *unsentmap;
1319 if (compression_switch && migrate_use_compression()) {
6f37bb8b 1320 res = ram_save_compressed_page(rs, ms, f, pss,
a82d593b
DDAG
1321 last_stage,
1322 bytes_transferred);
1323 } else {
6f37bb8b 1324 res = ram_save_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1325 bytes_transferred);
1326 }
1327
1328 if (res < 0) {
1329 return res;
1330 }
1331 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1332 if (unsentmap) {
1333 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1334 }
3fd3c4b3
DDAG
1335 /* Only update last_sent_block if a block was actually sent; xbzrle
1336 * might have decided the page was identical so didn't bother writing
1337 * to the stream.
1338 */
1339 if (res > 0) {
6f37bb8b 1340 rs->last_sent_block = pss->block;
3fd3c4b3 1341 }
a82d593b
DDAG
1342 }
1343
1344 return res;
1345}
1346
1347/**
3d0684b2 1348 * ram_save_host_page: save a whole host page
a82d593b 1349 *
3d0684b2
JQ
1350 * Starting at *offset send pages up to the end of the current host
1351 * page. It's valid for the initial offset to point into the middle of
1352 * a host page in which case the remainder of the hostpage is sent.
1353 * Only dirty target pages are sent. Note that the host page size may
1354 * be a huge page for this block.
a82d593b 1355 *
3d0684b2
JQ
1356 * Returns the number of pages written or negative on error
1357 *
6f37bb8b 1358 * @rs: current RAM state
3d0684b2 1359 * @ms: current migration state
a82d593b 1360 * @f: QEMUFile where to send the data
3d0684b2 1361 * @pss: data about the page we want to send
a82d593b
DDAG
1362 * @last_stage: if we are at the completion stage
1363 * @bytes_transferred: increase it with the number of transferred bytes
1364 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1365 */
6f37bb8b 1366static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890
HZ
1367 PageSearchStatus *pss,
1368 bool last_stage,
a82d593b
DDAG
1369 uint64_t *bytes_transferred,
1370 ram_addr_t dirty_ram_abs)
1371{
1372 int tmppages, pages = 0;
4c011c37
DDAG
1373 size_t pagesize = qemu_ram_pagesize(pss->block);
1374
a82d593b 1375 do {
6f37bb8b 1376 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1377 bytes_transferred, dirty_ram_abs);
1378 if (tmppages < 0) {
1379 return tmppages;
1380 }
1381
1382 pages += tmppages;
a08f6890 1383 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1384 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1385 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1386
1387 /* The offset we leave with is the last one we looked at */
a08f6890 1388 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1389 return pages;
1390}
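A worked example of the host-page loop above, assuming x86-style sizes (2 MiB hugepages, 4 KiB target pages):

/* Illustrative only: for a RAMBlock backed by 2 MiB hugepages,
 * qemu_ram_pagesize() returns 2 MiB while TARGET_PAGE_SIZE is 4 KiB,
 * so the do/while loop calls ram_save_target_page() up to
 * 2 MiB / 4 KiB = 512 times before pss->offset reaches the next
 * host-page boundary and the loop exits. */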
6c595cde 1391
56e93d26 1392/**
3d0684b2 1393 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1394 *
1395 * Called within an RCU critical section.
1396 *
3d0684b2 1397 * Returns the number of pages written where zero means no dirty pages
56e93d26 1398 *
6f37bb8b 1399 * @rs: current RAM state
56e93d26
JQ
1400 * @f: QEMUFile where to send the data
1401 * @last_stage: if we are at the completion stage
1402 * @bytes_transferred: increase it with the number of transferred bytes
a82d593b
DDAG
1403 *
1404 * On systems where host-page-size > target-page-size it will send all the
1405 * pages in a host page that are dirty.
56e93d26
JQ
1406 */
1407
6f37bb8b 1408static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
56e93d26
JQ
1409 uint64_t *bytes_transferred)
1410{
b8fb8cb7 1411 PageSearchStatus pss;
a82d593b 1412 MigrationState *ms = migrate_get_current();
56e93d26 1413 int pages = 0;
b9e60928 1414 bool again, found;
f3f491fc
DDAG
1415 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1416 ram_addr_t space */
56e93d26 1417
0827b9e9
AA
1418 /* No dirty page as there is zero RAM */
1419 if (!ram_bytes_total()) {
1420 return pages;
1421 }
1422
6f37bb8b
JQ
1423 pss.block = rs->last_seen_block;
1424 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1425 pss.complete_round = false;
1426
1427 if (!pss.block) {
1428 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1429 }
56e93d26 1430
b9e60928 1431 do {
a82d593b 1432 again = true;
6f37bb8b 1433 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
b9e60928 1434
a82d593b
DDAG
1435 if (!found) {
1436 /* priority queue empty, so just search for something dirty */
6f37bb8b 1437 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
a82d593b 1438 }
f3f491fc 1439
a82d593b 1440 if (found) {
6f37bb8b 1441 pages = ram_save_host_page(rs, ms, f, &pss,
a82d593b
DDAG
1442 last_stage, bytes_transferred,
1443 dirty_ram_abs);
56e93d26 1444 }
b9e60928 1445 } while (!pages && again);
56e93d26 1446
6f37bb8b
JQ
1447 rs->last_seen_block = pss.block;
1448 rs->last_offset = pss.offset;
56e93d26
JQ
1449
1450 return pages;
1451}
1452
1453void acct_update_position(QEMUFile *f, size_t size, bool zero)
1454{
1455 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b
JQ
1456 RAMState *rs = &ram_state;
1457
56e93d26 1458 if (zero) {
f7ccd61b 1459 rs->zero_pages += pages;
56e93d26 1460 } else {
b4d1c6e7 1461 rs->norm_pages += pages;
56e93d26
JQ
1462 bytes_transferred += size;
1463 qemu_update_position(f, size);
1464 }
1465}
1466
1467static ram_addr_t ram_save_remaining(void)
1468{
1469 return migration_dirty_pages;
1470}
1471
1472uint64_t ram_bytes_remaining(void)
1473{
1474 return ram_save_remaining() * TARGET_PAGE_SIZE;
1475}
1476
1477uint64_t ram_bytes_transferred(void)
1478{
1479 return bytes_transferred;
1480}
1481
1482uint64_t ram_bytes_total(void)
1483{
1484 RAMBlock *block;
1485 uint64_t total = 0;
1486
1487 rcu_read_lock();
1488 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1489 total += block->used_length;
1490 rcu_read_unlock();
1491 return total;
1492}
1493
1494void free_xbzrle_decoded_buf(void)
1495{
1496 g_free(xbzrle_decoded_buf);
1497 xbzrle_decoded_buf = NULL;
1498}
1499
60be6340
DL
1500static void migration_bitmap_free(struct BitmapRcu *bmap)
1501{
1502 g_free(bmap->bmap);
f3f491fc 1503 g_free(bmap->unsentmap);
60be6340
DL
1504 g_free(bmap);
1505}
1506
6ad2a215 1507static void ram_migration_cleanup(void *opaque)
56e93d26 1508{
2ff64038
LZ
1509 /* the caller holds the iothread lock or is in a bh, so there is
1510 * no writing race against this migration_bitmap
1511 */
60be6340
DL
1512 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1513 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1514 if (bitmap) {
56e93d26 1515 memory_global_dirty_log_stop();
60be6340 1516 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1517 }
1518
1519 XBZRLE_cache_lock();
1520 if (XBZRLE.cache) {
1521 cache_fini(XBZRLE.cache);
1522 g_free(XBZRLE.encoded_buf);
1523 g_free(XBZRLE.current_buf);
adb65dec 1524 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1525 XBZRLE.cache = NULL;
1526 XBZRLE.encoded_buf = NULL;
1527 XBZRLE.current_buf = NULL;
1528 }
1529 XBZRLE_cache_unlock();
1530}
1531
6f37bb8b 1532static void ram_state_reset(RAMState *rs)
56e93d26 1533{
6f37bb8b
JQ
1534 rs->last_seen_block = NULL;
1535 rs->last_sent_block = NULL;
1536 rs->last_offset = 0;
1537 rs->last_version = ram_list.version;
1538 rs->ram_bulk_stage = true;
56e93d26
JQ
1539}
1540
1541#define MAX_WAIT 50 /* ms, half buffered_file limit */
1542
dd631697
LZ
1543void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1544{
1545 /* called in qemu main thread, so there is
1546 * no writing race against this migration_bitmap
1547 */
60be6340
DL
1548 if (migration_bitmap_rcu) {
1549 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1550 bitmap = g_new(struct BitmapRcu, 1);
1551 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1552
1553 /* prevent migration_bitmap bits from being set by
1554 * migration_bitmap_sync_range() at the same time.
1555 * It is safe for migration if a migration_bitmap bit is
1556 * cleared at the same time.
1557 */
1558 qemu_mutex_lock(&migration_bitmap_mutex);
60be6340
DL
1559 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1560 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1561
1562 /* We don't have a way to safely extend the unsentmap
1563 * with RCU; so mark it as missing, and entry to postcopy
1564 * will fail.
1565 */
1566 bitmap->unsentmap = NULL;
1567
60be6340 1568 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
dd631697
LZ
1569 qemu_mutex_unlock(&migration_bitmap_mutex);
1570 migration_dirty_pages += new - old;
60be6340 1571 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1572 }
1573}
56e93d26 1574
4f2e4252
DDAG
1575/*
1576 * 'expected' is the value you expect the bitmap mostly to be full
1577 * of; it won't bother printing lines that are all this value.
1578 * If 'todump' is null the migration bitmap is dumped.
1579 */
1580void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1581{
1582 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1583
1584 int64_t cur;
1585 int64_t linelen = 128;
1586 char linebuf[129];
1587
1588 if (!todump) {
1589 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1590 }
1591
1592 for (cur = 0; cur < ram_pages; cur += linelen) {
1593 int64_t curb;
1594 bool found = false;
1595 /*
1596 * Last line; catch the case where the line length
1597 * is longer than remaining ram
1598 */
1599 if (cur + linelen > ram_pages) {
1600 linelen = ram_pages - cur;
1601 }
1602 for (curb = 0; curb < linelen; curb++) {
1603 bool thisbit = test_bit(cur + curb, todump);
1604 linebuf[curb] = thisbit ? '1' : '.';
1605 found = found || (thisbit != expected);
1606 }
1607 if (found) {
1608 linebuf[curb] = '\0';
1609 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1610 }
1611 }
1612}
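An illustrative line of the debug output: each printed line covers up to 128 pages, '1' marks a set bit and '.' a clear one, and lines whose bits all match 'expected' are suppressed (the address and bits below are made up and shortened):

/* Example output line (fabricated for illustration, truncated):
 *   0x00000080 : ....11.........1................
 */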
1613
e0b266f0
DDAG
1614/* **** functions for postcopy ***** */
1615
ced1c616
PB
1616void ram_postcopy_migrated_memory_release(MigrationState *ms)
1617{
1618 struct RAMBlock *block;
1619 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1620
1621 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1622 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1623 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1624 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1625
1626 while (run_start < range) {
1627 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1628 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1629 (run_end - run_start) << TARGET_PAGE_BITS);
1630 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1631 }
1632 }
1633}
1634
3d0684b2
JQ
1635/**
1636 * postcopy_send_discard_bm_ram: discard a RAMBlock
1637 *
1638 * Returns zero on success
1639 *
e0b266f0
DDAG
1640 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1641 * Note: At this point the 'unsentmap' is the processed bitmap combined
1642 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1643 *
1644 * @ms: current migration state
1645 * @pds: state for postcopy
1646 * @start: RAMBlock starting page
1647 * @length: RAMBlock size
e0b266f0
DDAG
1648 */
1649static int postcopy_send_discard_bm_ram(MigrationState *ms,
1650 PostcopyDiscardState *pds,
1651 unsigned long start,
1652 unsigned long length)
1653{
1654 unsigned long end = start + length; /* one after the end */
1655 unsigned long current;
1656 unsigned long *unsentmap;
1657
1658 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1659 for (current = start; current < end; ) {
1660 unsigned long one = find_next_bit(unsentmap, end, current);
1661
1662 if (one <= end) {
1663 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1664 unsigned long discard_length;
1665
1666 if (zero >= end) {
1667 discard_length = end - one;
1668 } else {
1669 discard_length = zero - one;
1670 }
d688c62d
DDAG
1671 if (discard_length) {
1672 postcopy_discard_send_range(ms, pds, one, discard_length);
1673 }
e0b266f0
DDAG
1674 current = one + discard_length;
1675 } else {
1676 current = one;
1677 }
1678 }
1679
1680 return 0;
1681}
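A short trace of the run-length logic above, under assumed bitmap contents:

/* Illustrative only: if the unsentmap holds ...0 1 1 1 0... starting at
 * page N, find_next_bit() sets 'one' = N, find_next_zero_bit() sets
 * 'zero' = N + 3, so discard_length = 3 and a single
 * postcopy_discard_send_range(ms, pds, N, 3) covers the whole run. */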
1682
3d0684b2
JQ
1683/**
1684 * postcopy_each_ram_send_discard: discard all RAMBlocks
1685 *
1686 * Returns 0 for success or negative for error
1687 *
e0b266f0
DDAG
1688 * Utility for the outgoing postcopy code.
1689 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1690 * passing it bitmap indexes and name.
e0b266f0
DDAG
1691 * (qemu_ram_foreach_block ends up passing unscaled lengths
1692 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1693 *
1694 * @ms: current migration state
e0b266f0
DDAG
1695 */
1696static int postcopy_each_ram_send_discard(MigrationState *ms)
1697{
1698 struct RAMBlock *block;
1699 int ret;
1700
1701 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1702 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1703 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1704 first,
1705 block->idstr);
1706
1707 /*
1708 * Postcopy sends chunks of bitmap over the wire, but it
1709 * just needs indexes at this point, which avoids it having
1710 * target-page-specific code.
1711 */
1712 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1713 block->used_length >> TARGET_PAGE_BITS);
1714 postcopy_discard_send_finish(ms, pds);
1715 if (ret) {
1716 return ret;
1717 }
1718 }
1719
1720 return 0;
1721}
1722
3d0684b2
JQ
1723/**
1724 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1725 *
1726 * Helper for postcopy_chunk_hostpages; it's called twice to
1727 * canonicalize the two bitmaps, which are similar but one is
1728 * inverted.
99e314eb 1729 *
3d0684b2
JQ
1730 * Postcopy requires that all target pages in a hostpage are dirty or
1731 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1732 *
3d0684b2
JQ
1733 * @ms: current migration state
1734 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1735 * otherwise we need to canonicalize partially dirty host pages
1736 * @block: block that contains the page we want to canonicalize
1737 * @pds: state for postcopy
99e314eb
DDAG
1738 */
1739static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1740 RAMBlock *block,
1741 PostcopyDiscardState *pds)
1742{
1743 unsigned long *bitmap;
1744 unsigned long *unsentmap;
29c59172 1745 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1746 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1747 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1748 unsigned long last = first + (len - 1);
1749 unsigned long run_start;
1750
29c59172
DDAG
1751 if (block->page_size == TARGET_PAGE_SIZE) {
1752 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1753 return;
1754 }
1755
99e314eb
DDAG
1756 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1757 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1758
1759 if (unsent_pass) {
1760 /* Find a sent page */
1761 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1762 } else {
1763 /* Find a dirty page */
1764 run_start = find_next_bit(bitmap, last + 1, first);
1765 }
1766
1767 while (run_start <= last) {
1768 bool do_fixup = false;
1769 unsigned long fixup_start_addr;
1770 unsigned long host_offset;
1771
1772 /*
1773 * If the start of this run of pages is in the middle of a host
1774 * page, then we need to fixup this host page.
1775 */
1776 host_offset = run_start % host_ratio;
1777 if (host_offset) {
1778 do_fixup = true;
1779 run_start -= host_offset;
1780 fixup_start_addr = run_start;
1781 /* For the next pass */
1782 run_start = run_start + host_ratio;
1783 } else {
1784 /* Find the end of this run */
1785 unsigned long run_end;
1786 if (unsent_pass) {
1787 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1788 } else {
1789 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1790 }
1791 /*
1792 * If the end isn't at the start of a host page, then the
1793 * run doesn't finish at the end of a host page
1794 * and we need to discard.
1795 */
1796 host_offset = run_end % host_ratio;
1797 if (host_offset) {
1798 do_fixup = true;
1799 fixup_start_addr = run_end - host_offset;
1800 /*
1801 * This host page has gone, the next loop iteration starts
1802 * from after the fixup
1803 */
1804 run_start = fixup_start_addr + host_ratio;
1805 } else {
1806 /*
1807 * No discards on this iteration, next loop starts from
1808 * next sent/dirty page
1809 */
1810 run_start = run_end + 1;
1811 }
1812 }
1813
1814 if (do_fixup) {
1815 unsigned long page;
1816
1817 /* Tell the destination to discard this page */
1818 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1819 /* For the unsent_pass we:
1820 * discard partially sent pages
1821 * For the !unsent_pass (dirty) we:
1822 * discard partially dirty pages that were sent
1823 * (any partially sent pages were already discarded
1824 * by the previous unsent_pass)
1825 */
1826 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1827 host_ratio);
1828 }
1829
1830 /* Clean up the bitmap */
1831 for (page = fixup_start_addr;
1832 page < fixup_start_addr + host_ratio; page++) {
1833 /* All pages in this host page are now not sent */
1834 set_bit(page, unsentmap);
1835
1836 /*
1837 * Remark them as dirty, updating the count for any pages
1838 * that weren't previously dirty.
1839 */
1840 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1841 }
1842 }
1843
1844 if (unsent_pass) {
1845 /* Find the next sent page for the next iteration */
1846 run_start = find_next_zero_bit(unsentmap, last + 1,
1847 run_start);
1848 } else {
1849 /* Find the next dirty page for the next iteration */
1850 run_start = find_next_bit(bitmap, last + 1, run_start);
1851 }
1852 }
1853}
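/*
 * A minimal standalone sketch of the fixup arithmetic used in
 * postcopy_chunk_hostpages_pass() above, assuming a hypothetical 2 MiB
 * host page over 4 KiB target pages (host_ratio == 512).  When a run
 * boundary falls in the middle of a host page, the whole host page
 * containing it is discarded and re-marked.  The example_* name is
 * hypothetical.
 */
static unsigned long example_fixup_start(unsigned long run_boundary,
                                         unsigned long host_ratio)
{
    unsigned long host_offset = run_boundary % host_ratio;

    /*
     * Round down to the start of the containing host page; a zero
     * offset means the boundary is already host-page aligned and no
     * fixup is needed.
     */
    return host_offset ? run_boundary - host_offset : run_boundary;
}
/*
 * e.g. example_fixup_start(1000, 512) == 512, so target pages 512..1023
 * (one whole host page) are discarded and re-marked dirty/unsent.
 */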
1854
3d0684b2
JQ
1855/**
1856 * postcopy_chunk_hostpages: discard any partially sent host page
1857 *
99e314eb
DDAG
1858 * Utility for the outgoing postcopy code.
1859 *
1860 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1861 * dirty host-page size chunks as all dirty. In this case the host-page
1862 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1863 *
3d0684b2
JQ
1864 * Returns zero on success
1865 *
1866 * @ms: current migration state
99e314eb
DDAG
1867 */
1868static int postcopy_chunk_hostpages(MigrationState *ms)
1869{
6f37bb8b 1870 RAMState *rs = &ram_state;
99e314eb
DDAG
1871 struct RAMBlock *block;
1872
99e314eb 1873 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1874 rs->last_seen_block = NULL;
1875 rs->last_sent_block = NULL;
1876 rs->last_offset = 0;
99e314eb
DDAG
1877
1878 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1879 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1880
1881 PostcopyDiscardState *pds =
1882 postcopy_discard_send_init(ms, first, block->idstr);
1883
1884 /* First pass: Discard all partially sent host pages */
1885 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1886 /*
1887 * Second pass: Ensure that all partially dirty host pages are made
1888 * fully dirty.
1889 */
1890 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1891
1892 postcopy_discard_send_finish(ms, pds);
1893 } /* ram_list loop */
1894
1895 return 0;
1896}
1897
3d0684b2
JQ
1898/**
1899 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1900 *
1901 * Returns zero on success
1902 *
e0b266f0
DDAG
1903 * Transmit the set of pages to be discarded after precopy to the target;
1904 * these are pages that:
1905 * a) Have been previously transmitted but are now dirty again
1906 * b) Have never been transmitted; this ensures that
1907 * any pages on the destination that have been mapped by background
1908 * tasks get discarded (transparent huge pages are the specific concern)
1909 * Hopefully this set is pretty sparse
3d0684b2
JQ
1910 *
1911 * @ms: current migration state
e0b266f0
DDAG
1912 */
1913int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1914{
1915 int ret;
1916 unsigned long *bitmap, *unsentmap;
1917
1918 rcu_read_lock();
1919
1920 /* This should be our last sync, the src is now paused */
8d820d6f 1921 migration_bitmap_sync(&ram_state);
e0b266f0
DDAG
1922
1923 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1924 if (!unsentmap) {
1925 /* We don't have a safe way to resize the sentmap, so
1926 * if the bitmap was resized it will be NULL at this
1927 * point.
1928 */
1929 error_report("migration ram resized during precopy phase");
1930 rcu_read_unlock();
1931 return -EINVAL;
1932 }
1933
29c59172 1934 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1935 ret = postcopy_chunk_hostpages(ms);
1936 if (ret) {
1937 rcu_read_unlock();
1938 return ret;
1939 }
1940
e0b266f0
DDAG
1941 /*
1942 * Update the unsentmap to be unsentmap = unsentmap | dirty
1943 */
1944 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1945 bitmap_or(unsentmap, unsentmap, bitmap,
1946 last_ram_offset() >> TARGET_PAGE_BITS);
1947
1948
1949 trace_ram_postcopy_send_discard_bitmap();
1950#ifdef DEBUG_POSTCOPY
1951 ram_debug_dump_bitmap(unsentmap, true);
1952#endif
1953
1954 ret = postcopy_each_ram_send_discard(ms);
1955 rcu_read_unlock();
1956
1957 return ret;
1958}
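/*
 * A minimal standalone sketch of the "unsentmap |= dirty" merge done
 * above (bitmap_or() over whole words): after it, a set bit means the
 * page is either still unsent or has been dirtied again, and in both
 * cases it must be discarded on the destination.  example_* is a
 * hypothetical stand-in, not QEMU's bitmap_or().
 */
static void example_bitmap_or(unsigned long *unsent,
                              const unsigned long *dirty,
                              unsigned long nwords)
{
    unsigned long i;

    for (i = 0; i < nwords; i++) {
        unsent[i] |= dirty[i];
    }
}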
1959
3d0684b2
JQ
1960/**
1961 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1962 *
3d0684b2 1963 * Returns zero on success
e0b266f0 1964 *
3d0684b2 1965 * @mis: current migration incoming state
36449157
JQ
1966 * @rbname: name of the RAMBlock of the request. NULL means the
1967 * same as the last one.
3d0684b2
JQ
1968 * @start: RAMBlock starting page
1969 * @length: RAMBlock size
e0b266f0
DDAG
1970 */
1971int ram_discard_range(MigrationIncomingState *mis,
36449157 1972 const char *rbname,
e0b266f0
DDAG
1973 uint64_t start, size_t length)
1974{
1975 int ret = -1;
1976
36449157 1977 trace_ram_discard_range(rbname, start, length);
d3a5038c 1978
e0b266f0 1979 rcu_read_lock();
36449157 1980 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1981
1982 if (!rb) {
36449157 1983 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1984 goto err;
1985 }
1986
d3a5038c 1987 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1988
1989err:
1990 rcu_read_unlock();
1991
1992 return ret;
1993}
1994
6f37bb8b 1995static int ram_save_init_globals(RAMState *rs)
56e93d26 1996{
56e93d26
JQ
1997 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1998
8d820d6f 1999 rs->dirty_rate_high_cnt = 0;
5a987738 2000 rs->bitmap_sync_count = 0;
f7ccd61b 2001 rs->zero_pages = 0;
b4d1c6e7 2002 rs->norm_pages = 0;
23b28c3c 2003 rs->iterations = 0;
07ed50a2 2004 rs->xbzrle_bytes = 0;
f36ada95 2005 rs->xbzrle_pages = 0;
544c36f1 2006 rs->xbzrle_cache_miss = 0;
b07016b6 2007 rs->xbzrle_cache_miss_rate = 0;
f664da80 2008 migration_bitmap_sync_init(rs);
dd631697 2009 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
2010
2011 if (migrate_use_xbzrle()) {
2012 XBZRLE_cache_lock();
adb65dec 2013 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2014 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2015 TARGET_PAGE_SIZE,
2016 TARGET_PAGE_SIZE);
2017 if (!XBZRLE.cache) {
2018 XBZRLE_cache_unlock();
2019 error_report("Error creating cache");
2020 return -1;
2021 }
2022 XBZRLE_cache_unlock();
2023
2024 /* We prefer not to abort if there is no memory */
2025 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2026 if (!XBZRLE.encoded_buf) {
2027 error_report("Error allocating encoded_buf");
2028 return -1;
2029 }
2030
2031 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2032 if (!XBZRLE.current_buf) {
2033 error_report("Error allocating current_buf");
2034 g_free(XBZRLE.encoded_buf);
2035 XBZRLE.encoded_buf = NULL;
2036 return -1;
2037 }
2038
2039 acct_clear();
2040 }
2041
49877834
PB
2042 /* For memory_global_dirty_log_start below. */
2043 qemu_mutex_lock_iothread();
2044
56e93d26
JQ
2045 qemu_mutex_lock_ramlist();
2046 rcu_read_lock();
2047 bytes_transferred = 0;
6f37bb8b 2048 ram_state_reset(rs);
56e93d26 2049
f3f491fc 2050 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
0827b9e9
AA
2051 /* Skip setting bitmap if there is no RAM */
2052 if (ram_bytes_total()) {
2053 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2054 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2055 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2056
2057 if (migrate_postcopy_ram()) {
2058 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2059 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2060 }
f3f491fc
DDAG
2061 }
2062
56e93d26
JQ
2063 /*
2064 * Count the total number of pages used by ram blocks not including any
2065 * gaps due to alignment or unplugs.
2066 */
2067 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2068
2069 memory_global_dirty_log_start();
8d820d6f 2070 migration_bitmap_sync(rs);
56e93d26 2071 qemu_mutex_unlock_ramlist();
49877834 2072 qemu_mutex_unlock_iothread();
a91246c9
HZ
2073 rcu_read_unlock();
2074
2075 return 0;
2076}
2077
3d0684b2
JQ
2078/*
2079 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2080 * a long-running RCU critical section. When RCU reclaims in the code
2081 * start to become numerous it will be necessary to reduce the
2082 * granularity of these critical sections.
2083 */
2084
3d0684b2
JQ
2085/**
2086 * ram_save_setup: Setup RAM for migration
2087 *
2088 * Returns zero to indicate success and negative for error
2089 *
2090 * @f: QEMUFile where to send the data
2091 * @opaque: RAMState pointer
2092 */
a91246c9
HZ
2093static int ram_save_setup(QEMUFile *f, void *opaque)
2094{
6f37bb8b 2095 RAMState *rs = opaque;
a91246c9
HZ
2096 RAMBlock *block;
2097
2098 /* migration has already setup the bitmap, reuse it. */
2099 if (!migration_in_colo_state()) {
6f37bb8b 2100 if (ram_save_init_globals(rs) < 0) {
a91246c9
HZ
2101 return -1;
2102 }
2103 }
2104
2105 rcu_read_lock();
56e93d26
JQ
2106
2107 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2108
2109 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2110 qemu_put_byte(f, strlen(block->idstr));
2111 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2112 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2113 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2114 qemu_put_be64(f, block->page_size);
2115 }
56e93d26
JQ
2116 }
2117
2118 rcu_read_unlock();
2119
2120 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2121 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2122
2123 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2124
2125 return 0;
2126}
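/*
 * Setup-stage stream layout, as written by ram_save_setup() above (an
 * informal sketch derived from the code, not a normative format
 * description):
 *
 *   be64   ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   repeated for each RAMBlock:
 *     u8     strlen(idstr)
 *     bytes  idstr (no NUL terminator)
 *     be64   used_length
 *     be64   page_size        -- only when postcopy is enabled and the
 *                                block's page size differs from
 *                                qemu_host_page_size
 *   be64   RAM_SAVE_FLAG_EOS
 *
 * (Any RDMA hook traffic from ram_control_before/after_iterate() is
 * handled by the transport and not shown here.)
 */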
2127
3d0684b2
JQ
2128/**
2129 * ram_save_iterate: iterative stage for migration
2130 *
2131 * Returns zero to indicate success and negative for error
2132 *
2133 * @f: QEMUFile where to send the data
2134 * @opaque: RAMState pointer
2135 */
56e93d26
JQ
2136static int ram_save_iterate(QEMUFile *f, void *opaque)
2137{
6f37bb8b 2138 RAMState *rs = opaque;
56e93d26
JQ
2139 int ret;
2140 int i;
2141 int64_t t0;
5c90308f 2142 int done = 0;
56e93d26
JQ
2143
2144 rcu_read_lock();
6f37bb8b
JQ
2145 if (ram_list.version != rs->last_version) {
2146 ram_state_reset(rs);
56e93d26
JQ
2147 }
2148
2149 /* Read version before ram_list.blocks */
2150 smp_rmb();
2151
2152 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2153
2154 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2155 i = 0;
2156 while ((ret = qemu_file_rate_limit(f)) == 0) {
2157 int pages;
2158
6f37bb8b 2159 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
56e93d26
JQ
2160 /* no more pages to send */
2161 if (pages == 0) {
5c90308f 2162 done = 1;
56e93d26
JQ
2163 break;
2164 }
23b28c3c 2165 rs->iterations++;
070afca2 2166
56e93d26
JQ
2167 /* we want to check in the 1st loop, just in case it was the 1st time
2168 and we had to sync the dirty bitmap.
2169 qemu_get_clock_ns() is a bit expensive, so we only check every
2170 few iterations
2171 */
2172 if ((i & 63) == 0) {
2173 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2174 if (t1 > MAX_WAIT) {
55c4446b 2175 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2176 break;
2177 }
2178 }
2179 i++;
2180 }
2181 flush_compressed_data(f);
2182 rcu_read_unlock();
2183
2184 /*
2185 * Must occur before EOS (or any QEMUFile operation)
2186 * because of RDMA protocol.
2187 */
2188 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2189
2190 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2191 bytes_transferred += 8;
2192
2193 ret = qemu_file_get_error(f);
2194 if (ret < 0) {
2195 return ret;
2196 }
2197
5c90308f 2198 return done;
56e93d26
JQ
2199}
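/*
 * A minimal standalone sketch of the pacing logic in ram_save_iterate()
 * above: the clock is sampled only every 64 pages (it is relatively
 * expensive), and the loop gives up once MAX_WAIT ms have passed, even
 * if the rate limiter would still allow more data.  now_ms() and the
 * example_* name are hypothetical.
 */
static void example_paced_loop(long (*now_ms)(void), long max_wait_ms)
{
    long t0 = now_ms();
    int i = 0;

    for (;;) {
        /* ... send one page here; break when there are none left ... */

        if ((i & 63) == 0) {   /* only probe the clock every 64 pages */
            long t1 = now_ms() - t0;

            if (t1 > max_wait_ms) {
                break;
            }
        }
        i++;
    }
}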
2200
3d0684b2
JQ
2201/**
2202 * ram_save_complete: function called to send the remaining amount of ram
2203 *
2204 * Returns zero to indicate success
2205 *
2206 * Called with iothread lock
2207 *
2208 * @f: QEMUFile where to send the data
2209 * @opaque: RAMState pointer
2210 */
56e93d26
JQ
2211static int ram_save_complete(QEMUFile *f, void *opaque)
2212{
6f37bb8b
JQ
2213 RAMState *rs = opaque;
2214
56e93d26
JQ
2215 rcu_read_lock();
2216
663e6c1d 2217 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2218 migration_bitmap_sync(rs);
663e6c1d 2219 }
56e93d26
JQ
2220
2221 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2222
2223 /* try transferring iterative blocks of memory */
2224
2225 /* flush all remaining blocks regardless of rate limiting */
2226 while (true) {
2227 int pages;
2228
6f37bb8b 2229 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
a91246c9 2230 &bytes_transferred);
56e93d26
JQ
2231 /* no more blocks to send */
2232 if (pages == 0) {
2233 break;
2234 }
2235 }
2236
2237 flush_compressed_data(f);
2238 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2239
2240 rcu_read_unlock();
d09a6fde 2241
56e93d26
JQ
2242 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2243
2244 return 0;
2245}
2246
c31b098f
DDAG
2247static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2248 uint64_t *non_postcopiable_pending,
2249 uint64_t *postcopiable_pending)
56e93d26 2250{
8d820d6f 2251 RAMState *rs = opaque;
56e93d26
JQ
2252 uint64_t remaining_size;
2253
2254 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2255
663e6c1d
DDAG
2256 if (!migration_in_postcopy(migrate_get_current()) &&
2257 remaining_size < max_size) {
56e93d26
JQ
2258 qemu_mutex_lock_iothread();
2259 rcu_read_lock();
8d820d6f 2260 migration_bitmap_sync(rs);
56e93d26
JQ
2261 rcu_read_unlock();
2262 qemu_mutex_unlock_iothread();
2263 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2264 }
c31b098f
DDAG
2265
2266 /* We can do postcopy, and all the data is postcopiable */
2267 *postcopiable_pending += remaining_size;
56e93d26
JQ
2268}
2269
2270static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2271{
2272 unsigned int xh_len;
2273 int xh_flags;
063e760a 2274 uint8_t *loaded_data;
56e93d26
JQ
2275
2276 if (!xbzrle_decoded_buf) {
2277 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2278 }
063e760a 2279 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2280
2281 /* extract RLE header */
2282 xh_flags = qemu_get_byte(f);
2283 xh_len = qemu_get_be16(f);
2284
2285 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2286 error_report("Failed to load XBZRLE page - wrong compression!");
2287 return -1;
2288 }
2289
2290 if (xh_len > TARGET_PAGE_SIZE) {
2291 error_report("Failed to load XBZRLE page - len overflow!");
2292 return -1;
2293 }
2294 /* load data and decode */
063e760a 2295 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2296
2297 /* decode RLE */
063e760a 2298 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2299 TARGET_PAGE_SIZE) == -1) {
2300 error_report("Failed to load XBZRLE page - decode error!");
2301 return -1;
2302 }
2303
2304 return 0;
2305}
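/*
 * XBZRLE page sub-format consumed by load_xbzrle() above (an informal
 * sketch derived from the code, not a normative description):
 *
 *   u8    xh_flags  -- must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len    -- length of the encoded data, <= TARGET_PAGE_SIZE
 *   bytes encoded   -- xh_len bytes, decoded as a delta on top of the
 *                      page content already present at 'host'
 */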
2306
3d0684b2
JQ
2307/**
2308 * ram_block_from_stream: read a RAMBlock id from the migration stream
2309 *
2310 * Must be called from within a rcu critical section.
2311 *
56e93d26 2312 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2313 *
3d0684b2
JQ
2314 * @f: QEMUFile where to read the data from
2315 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2316 */
3d0684b2 2317static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2318{
2319 static RAMBlock *block = NULL;
2320 char id[256];
2321 uint8_t len;
2322
2323 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2324 if (!block) {
56e93d26
JQ
2325 error_report("Ack, bad migration stream!");
2326 return NULL;
2327 }
4c4bad48 2328 return block;
56e93d26
JQ
2329 }
2330
2331 len = qemu_get_byte(f);
2332 qemu_get_buffer(f, (uint8_t *)id, len);
2333 id[len] = 0;
2334
e3dd7493 2335 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2336 if (!block) {
2337 error_report("Can't find block %s", id);
2338 return NULL;
56e93d26
JQ
2339 }
2340
4c4bad48
HZ
2341 return block;
2342}
2343
2344static inline void *host_from_ram_block_offset(RAMBlock *block,
2345 ram_addr_t offset)
2346{
2347 if (!offset_in_ramblock(block, offset)) {
2348 return NULL;
2349 }
2350
2351 return block->host + offset;
56e93d26
JQ
2352}
2353
3d0684b2
JQ
2354/**
2355 * ram_handle_compressed: handle the zero page case
2356 *
56e93d26
JQ
2357 * If a page (or a whole RDMA chunk) has been
2358 * determined to be zero, then zap it.
3d0684b2
JQ
2359 *
2360 * @host: host address for the zero page
2361 * @ch: what the page is filled from. We only support zero
2362 * @size: size of the zero page
56e93d26
JQ
2363 */
2364void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2365{
2366 if (ch != 0 || !is_zero_range(host, size)) {
2367 memset(host, ch, size);
2368 }
2369}
2370
2371static void *do_data_decompress(void *opaque)
2372{
2373 DecompressParam *param = opaque;
2374 unsigned long pagesize;
33d151f4
LL
2375 uint8_t *des;
2376 int len;
56e93d26 2377
33d151f4 2378 qemu_mutex_lock(&param->mutex);
90e56fb4 2379 while (!param->quit) {
33d151f4
LL
2380 if (param->des) {
2381 des = param->des;
2382 len = param->len;
2383 param->des = 0;
2384 qemu_mutex_unlock(&param->mutex);
2385
56e93d26 2386 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2387 /* uncompress() can fail in some cases, especially
2388 * when the page was dirtied while it was being compressed; this is
2389 * not a problem because the dirty page will be retransmitted
2390 * and uncompress() won't corrupt the data in other pages.
2391 */
33d151f4
LL
2392 uncompress((Bytef *)des, &pagesize,
2393 (const Bytef *)param->compbuf, len);
73a8912b 2394
33d151f4
LL
2395 qemu_mutex_lock(&decomp_done_lock);
2396 param->done = true;
2397 qemu_cond_signal(&decomp_done_cond);
2398 qemu_mutex_unlock(&decomp_done_lock);
2399
2400 qemu_mutex_lock(&param->mutex);
2401 } else {
2402 qemu_cond_wait(&param->cond, &param->mutex);
2403 }
56e93d26 2404 }
33d151f4 2405 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2406
2407 return NULL;
2408}
2409
5533b2e9
LL
2410static void wait_for_decompress_done(void)
2411{
2412 int idx, thread_count;
2413
2414 if (!migrate_use_compression()) {
2415 return;
2416 }
2417
2418 thread_count = migrate_decompress_threads();
2419 qemu_mutex_lock(&decomp_done_lock);
2420 for (idx = 0; idx < thread_count; idx++) {
2421 while (!decomp_param[idx].done) {
2422 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2423 }
2424 }
2425 qemu_mutex_unlock(&decomp_done_lock);
2426}
2427
56e93d26
JQ
2428void migrate_decompress_threads_create(void)
2429{
2430 int i, thread_count;
2431
2432 thread_count = migrate_decompress_threads();
2433 decompress_threads = g_new0(QemuThread, thread_count);
2434 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2435 qemu_mutex_init(&decomp_done_lock);
2436 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2437 for (i = 0; i < thread_count; i++) {
2438 qemu_mutex_init(&decomp_param[i].mutex);
2439 qemu_cond_init(&decomp_param[i].cond);
2440 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2441 decomp_param[i].done = true;
90e56fb4 2442 decomp_param[i].quit = false;
56e93d26
JQ
2443 qemu_thread_create(decompress_threads + i, "decompress",
2444 do_data_decompress, decomp_param + i,
2445 QEMU_THREAD_JOINABLE);
2446 }
2447}
2448
2449void migrate_decompress_threads_join(void)
2450{
2451 int i, thread_count;
2452
56e93d26
JQ
2453 thread_count = migrate_decompress_threads();
2454 for (i = 0; i < thread_count; i++) {
2455 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2456 decomp_param[i].quit = true;
56e93d26
JQ
2457 qemu_cond_signal(&decomp_param[i].cond);
2458 qemu_mutex_unlock(&decomp_param[i].mutex);
2459 }
2460 for (i = 0; i < thread_count; i++) {
2461 qemu_thread_join(decompress_threads + i);
2462 qemu_mutex_destroy(&decomp_param[i].mutex);
2463 qemu_cond_destroy(&decomp_param[i].cond);
2464 g_free(decomp_param[i].compbuf);
2465 }
2466 g_free(decompress_threads);
2467 g_free(decomp_param);
56e93d26
JQ
2468 decompress_threads = NULL;
2469 decomp_param = NULL;
56e93d26
JQ
2470}
2471
c1bc6626 2472static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2473 void *host, int len)
2474{
2475 int idx, thread_count;
2476
2477 thread_count = migrate_decompress_threads();
73a8912b 2478 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2479 while (true) {
2480 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2481 if (decomp_param[idx].done) {
33d151f4
LL
2482 decomp_param[idx].done = false;
2483 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2484 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2485 decomp_param[idx].des = host;
2486 decomp_param[idx].len = len;
33d151f4
LL
2487 qemu_cond_signal(&decomp_param[idx].cond);
2488 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2489 break;
2490 }
2491 }
2492 if (idx < thread_count) {
2493 break;
73a8912b
LL
2494 } else {
2495 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2496 }
2497 }
73a8912b 2498 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2499}
2500
3d0684b2
JQ
2501/**
2502 * ram_postcopy_incoming_init: allocate postcopy data structures
2503 *
2504 * Returns 0 for success and negative if there was one error
2505 *
2506 * @mis: current migration incoming state
2507 *
2508 * Allocate data structures etc needed by incoming migration with
2509 * postcopy-ram. postcopy-ram's similarly named
2510 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2511 */
2512int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2513{
2514 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2515
2516 return postcopy_ram_incoming_init(mis, ram_pages);
2517}
2518
3d0684b2
JQ
2519/**
2520 * ram_load_postcopy: load a page in postcopy case
2521 *
2522 * Returns 0 for success or -errno in case of error
2523 *
a7180877
DDAG
2524 * Called in postcopy mode by ram_load().
2525 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2526 *
2527 * @f: QEMUFile where to send the data
a7180877
DDAG
2528 */
2529static int ram_load_postcopy(QEMUFile *f)
2530{
2531 int flags = 0, ret = 0;
2532 bool place_needed = false;
28abd200 2533 bool matching_page_sizes = false;
a7180877
DDAG
2534 MigrationIncomingState *mis = migration_incoming_get_current();
2535 /* Temporary page that is later 'placed' */
2536 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2537 void *last_host = NULL;
a3b6ff6d 2538 bool all_zero = false;
a7180877
DDAG
2539
2540 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2541 ram_addr_t addr;
2542 void *host = NULL;
2543 void *page_buffer = NULL;
2544 void *place_source = NULL;
df9ff5e1 2545 RAMBlock *block = NULL;
a7180877 2546 uint8_t ch;
a7180877
DDAG
2547
2548 addr = qemu_get_be64(f);
2549 flags = addr & ~TARGET_PAGE_MASK;
2550 addr &= TARGET_PAGE_MASK;
2551
2552 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2553 place_needed = false;
2554 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2555 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2556
2557 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2558 if (!host) {
2559 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2560 ret = -EINVAL;
2561 break;
2562 }
28abd200 2563 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2564 /*
28abd200
DDAG
2565 * Postcopy requires that we place whole host pages atomically;
2566 * these may be huge pages for RAMBlocks that are backed by
2567 * hugetlbfs.
a7180877
DDAG
2568 * To make it atomic, the data is read into a temporary page
2569 * that's moved into place later.
2570 * The migration protocol uses, possibly smaller, target pages;
2571 * however, the source ensures it always sends all the components
2572 * of a host page in order.
2573 */
2574 page_buffer = postcopy_host_page +
28abd200 2575 ((uintptr_t)host & (block->page_size - 1));
a7180877 2576 /* If all TP are zero then we can optimise the place */
28abd200 2577 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2578 all_zero = true;
c53b7ddc
DDAG
2579 } else {
2580 /* not the 1st TP within the HP */
2581 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2582 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2583 host, last_host);
2584 ret = -EINVAL;
2585 break;
2586 }
a7180877
DDAG
2587 }
2588
c53b7ddc 2589
a7180877
DDAG
2590 /*
2591 * If it's the last part of a host page then we place the host
2592 * page
2593 */
2594 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2595 (block->page_size - 1)) == 0;
a7180877
DDAG
2596 place_source = postcopy_host_page;
2597 }
c53b7ddc 2598 last_host = host;
a7180877
DDAG
2599
2600 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2601 case RAM_SAVE_FLAG_COMPRESS:
2602 ch = qemu_get_byte(f);
2603 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2604 if (ch) {
2605 all_zero = false;
2606 }
2607 break;
2608
2609 case RAM_SAVE_FLAG_PAGE:
2610 all_zero = false;
2611 if (!place_needed || !matching_page_sizes) {
2612 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2613 } else {
2614 /* Avoid the extra qemu_file copy: postcopy is going to copy
2615 * the page into place later anyway, so read it in place; we can
2616 * only do this when the whole page is read in one go (matching page sizes)
2617 */
2618 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2619 TARGET_PAGE_SIZE);
2620 }
2621 break;
2622 case RAM_SAVE_FLAG_EOS:
2623 /* normal exit */
2624 break;
2625 default:
2626 error_report("Unknown combination of migration flags: %#x"
2627 " (postcopy mode)", flags);
2628 ret = -EINVAL;
2629 }
2630
2631 if (place_needed) {
2632 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2633 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2634
a7180877 2635 if (all_zero) {
df9ff5e1
DDAG
2636 ret = postcopy_place_page_zero(mis, place_dest,
2637 block->page_size);
a7180877 2638 } else {
df9ff5e1
DDAG
2639 ret = postcopy_place_page(mis, place_dest,
2640 place_source, block->page_size);
a7180877
DDAG
2641 }
2642 }
2643 if (!ret) {
2644 ret = qemu_file_get_error(f);
2645 }
2646 }
2647
2648 return ret;
2649}
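/*
 * A minimal standalone sketch of the placement arithmetic in
 * ram_load_postcopy() above, assuming a hypothetical 2 MiB host page
 * built from 4 KiB target pages: 'host' points at the *last* target
 * page of the host page when place_needed becomes true, so stepping
 * back by (page_size - TARGET_PAGE_SIZE) recovers the host-page start
 * that the temporary page is atomically placed over.  example_* is
 * hypothetical.
 */
static void *example_place_dest(void *host, unsigned long target_page_size,
                                unsigned long host_page_size)
{
    /* Mirrors: place_dest = host + TARGET_PAGE_SIZE - block->page_size */
    return (char *)host + target_page_size - host_page_size;
}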
2650
56e93d26
JQ
2651static int ram_load(QEMUFile *f, void *opaque, int version_id)
2652{
2653 int flags = 0, ret = 0;
2654 static uint64_t seq_iter;
2655 int len = 0;
a7180877
DDAG
2656 /*
2657 * If the system is running in postcopy mode, page inserts to host memory must
2658 * be atomic
2659 */
2660 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2661 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
2662 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2663
2664 seq_iter++;
2665
2666 if (version_id != 4) {
2667 ret = -EINVAL;
2668 }
2669
2670 /* This RCU critical section can be very long running.
2671 * When RCU reclaims in the code start to become numerous,
2672 * it will be necessary to reduce the granularity of this
2673 * critical section.
2674 */
2675 rcu_read_lock();
a7180877
DDAG
2676
2677 if (postcopy_running) {
2678 ret = ram_load_postcopy(f);
2679 }
2680
2681 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2682 ram_addr_t addr, total_ram_bytes;
a776aa15 2683 void *host = NULL;
56e93d26
JQ
2684 uint8_t ch;
2685
2686 addr = qemu_get_be64(f);
2687 flags = addr & ~TARGET_PAGE_MASK;
2688 addr &= TARGET_PAGE_MASK;
2689
a776aa15
DDAG
2690 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2691 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2692 RAMBlock *block = ram_block_from_stream(f, flags);
2693
2694 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2695 if (!host) {
2696 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2697 ret = -EINVAL;
2698 break;
2699 }
2700 }
2701
56e93d26
JQ
2702 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2703 case RAM_SAVE_FLAG_MEM_SIZE:
2704 /* Synchronize RAM block list */
2705 total_ram_bytes = addr;
2706 while (!ret && total_ram_bytes) {
2707 RAMBlock *block;
56e93d26
JQ
2708 char id[256];
2709 ram_addr_t length;
2710
2711 len = qemu_get_byte(f);
2712 qemu_get_buffer(f, (uint8_t *)id, len);
2713 id[len] = 0;
2714 length = qemu_get_be64(f);
2715
e3dd7493
DDAG
2716 block = qemu_ram_block_by_name(id);
2717 if (block) {
2718 if (length != block->used_length) {
2719 Error *local_err = NULL;
56e93d26 2720
fa53a0e5 2721 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2722 &local_err);
2723 if (local_err) {
2724 error_report_err(local_err);
56e93d26 2725 }
56e93d26 2726 }
ef08fb38
DDAG
2727 /* For postcopy we need to check hugepage sizes match */
2728 if (postcopy_advised &&
2729 block->page_size != qemu_host_page_size) {
2730 uint64_t remote_page_size = qemu_get_be64(f);
2731 if (remote_page_size != block->page_size) {
2732 error_report("Mismatched RAM page size %s "
2733 "(local) %zd != %" PRId64,
2734 id, block->page_size,
2735 remote_page_size);
2736 ret = -EINVAL;
2737 }
2738 }
e3dd7493
DDAG
2739 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2740 block->idstr);
2741 } else {
56e93d26
JQ
2742 error_report("Unknown ramblock \"%s\", cannot "
2743 "accept migration", id);
2744 ret = -EINVAL;
2745 }
2746
2747 total_ram_bytes -= length;
2748 }
2749 break;
a776aa15 2750
56e93d26 2751 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2752 ch = qemu_get_byte(f);
2753 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2754 break;
a776aa15 2755
56e93d26 2756 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2757 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2758 break;
56e93d26 2759
a776aa15 2760 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2761 len = qemu_get_be32(f);
2762 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2763 error_report("Invalid compressed data length: %d", len);
2764 ret = -EINVAL;
2765 break;
2766 }
c1bc6626 2767 decompress_data_with_multi_threads(f, host, len);
56e93d26 2768 break;
a776aa15 2769
56e93d26 2770 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2771 if (load_xbzrle(f, addr, host) < 0) {
2772 error_report("Failed to decompress XBZRLE page at "
2773 RAM_ADDR_FMT, addr);
2774 ret = -EINVAL;
2775 break;
2776 }
2777 break;
2778 case RAM_SAVE_FLAG_EOS:
2779 /* normal exit */
2780 break;
2781 default:
2782 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2783 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2784 } else {
2785 error_report("Unknown combination of migration flags: %#x",
2786 flags);
2787 ret = -EINVAL;
2788 }
2789 }
2790 if (!ret) {
2791 ret = qemu_file_get_error(f);
2792 }
2793 }
2794
5533b2e9 2795 wait_for_decompress_done();
56e93d26 2796 rcu_read_unlock();
55c4446b 2797 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2798 return ret;
2799}
2800
2801static SaveVMHandlers savevm_ram_handlers = {
2802 .save_live_setup = ram_save_setup,
2803 .save_live_iterate = ram_save_iterate,
763c906b 2804 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2805 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2806 .save_live_pending = ram_save_pending,
2807 .load_state = ram_load,
6ad2a215 2808 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2809};
2810
2811void ram_mig_init(void)
2812{
2813 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2814 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2815}