migration/ram.c (QEMU, mirror_qemu.git), blame view at commit "ram: Create RAMState"
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26 48static int dirty_rate_high_cnt;
56e93d26
JQ
49
50static uint64_t bitmap_sync_count;
51
52/***********************************************************/
53/* ram save/restore */
54
55#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
56#define RAM_SAVE_FLAG_COMPRESS 0x02
57#define RAM_SAVE_FLAG_MEM_SIZE 0x04
58#define RAM_SAVE_FLAG_PAGE 0x08
59#define RAM_SAVE_FLAG_EOS 0x10
60#define RAM_SAVE_FLAG_CONTINUE 0x20
61#define RAM_SAVE_FLAG_XBZRLE 0x40
62/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
63#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
64
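/*
 * For orientation (a sketch, not a protocol definition): these flags travel
 * in the low bits of the 64-bit offset word written by save_page_header(),
 * which works because page offsets are TARGET_PAGE_SIZE aligned.  A receiver
 * would split the word roughly as:
 *
 *     uint64_t word  = qemu_get_be64(f);
 *     uint64_t flags = word & ~TARGET_PAGE_MASK;   // RAM_SAVE_FLAG_* bits
 *     ram_addr_t off = word & TARGET_PAGE_MASK;    // page-aligned offset
 */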
adb65dec 65static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
66
67static inline bool is_zero_range(uint8_t *p, uint64_t size)
68{
a1febc49 69 return buffer_is_zero(p, size);
56e93d26
JQ
70}
71
72/* struct that contains the XBZRLE cache and a static page
73 used by the compression */
74static struct {
75 /* buffer used for XBZRLE encoding */
76 uint8_t *encoded_buf;
77 /* buffer for storing page content */
78 uint8_t *current_buf;
79 /* Cache for XBZRLE, Protected by lock. */
80 PageCache *cache;
81 QemuMutex lock;
82} XBZRLE;
83
84/* buffer used for XBZRLE decoding */
85static uint8_t *xbzrle_decoded_buf;
86
87static void XBZRLE_cache_lock(void)
88{
89 if (migrate_use_xbzrle())
90 qemu_mutex_lock(&XBZRLE.lock);
91}
92
93static void XBZRLE_cache_unlock(void)
94{
95 if (migrate_use_xbzrle())
96 qemu_mutex_unlock(&XBZRLE.lock);
97}
98
3d0684b2
JQ
99/**
100 * xbzrle_cache_resize: resize the xbzrle cache
101 *
102 * This function is called from qmp_migrate_set_cache_size in main
103 * thread, possibly while a migration is in progress. A running
104 * migration may be using the cache and might finish during this call,
105 * hence changes to the cache are protected by XBZRLE.lock().
106 *
107 * Returns the new_size or negative in case of error.
108 *
109 * @new_size: new cache size
56e93d26
JQ
110 */
111int64_t xbzrle_cache_resize(int64_t new_size)
112{
113 PageCache *new_cache;
114 int64_t ret;
115
116 if (new_size < TARGET_PAGE_SIZE) {
117 return -1;
118 }
119
120 XBZRLE_cache_lock();
121
122 if (XBZRLE.cache != NULL) {
123 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
124 goto out_new_size;
125 }
126 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
127 TARGET_PAGE_SIZE);
128 if (!new_cache) {
129 error_report("Error creating cache");
130 ret = -1;
131 goto out;
132 }
133
134 cache_fini(XBZRLE.cache);
135 XBZRLE.cache = new_cache;
136 }
137
138out_new_size:
139 ret = pow2floor(new_size);
140out:
141 XBZRLE_cache_unlock();
142 return ret;
143}
144
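/*
 * Worked example: a QMP request for a 300 MiB cache reports back
 * pow2floor(300 MiB) == 256 MiB, while any request smaller than
 * TARGET_PAGE_SIZE is rejected with -1.  A hypothetical caller:
 *
 *     int64_t got = xbzrle_cache_resize(300 * 1024 * 1024);
 *     if (got < 0) {
 *         // resize failed, error already reported
 *     }
 */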
6f37bb8b
JQ
145/* State of RAM for migration */
146struct RAMState {
147 /* Last block that we have visited searching for dirty pages */
148 RAMBlock *last_seen_block;
149 /* Last block from where we have sent data */
150 RAMBlock *last_sent_block;
151 /* Last offset we have sent data from */
152 ram_addr_t last_offset;
153 /* last ram version we have seen */
154 uint32_t last_version;
155 /* We are in the first round */
156 bool ram_bulk_stage;
157};
158typedef struct RAMState RAMState;
159
160static RAMState ram_state;
161
56e93d26
JQ
162/* accounting for migration statistics */
163typedef struct AccountingInfo {
164 uint64_t dup_pages;
165 uint64_t skipped_pages;
166 uint64_t norm_pages;
167 uint64_t iterations;
168 uint64_t xbzrle_bytes;
169 uint64_t xbzrle_pages;
170 uint64_t xbzrle_cache_miss;
171 double xbzrle_cache_miss_rate;
172 uint64_t xbzrle_overflows;
173} AccountingInfo;
174
175static AccountingInfo acct_info;
176
177static void acct_clear(void)
178{
179 memset(&acct_info, 0, sizeof(acct_info));
180}
181
182uint64_t dup_mig_bytes_transferred(void)
183{
184 return acct_info.dup_pages * TARGET_PAGE_SIZE;
185}
186
187uint64_t dup_mig_pages_transferred(void)
188{
189 return acct_info.dup_pages;
190}
191
192uint64_t skipped_mig_bytes_transferred(void)
193{
194 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
195}
196
197uint64_t skipped_mig_pages_transferred(void)
198{
199 return acct_info.skipped_pages;
200}
201
202uint64_t norm_mig_bytes_transferred(void)
203{
204 return acct_info.norm_pages * TARGET_PAGE_SIZE;
205}
206
207uint64_t norm_mig_pages_transferred(void)
208{
209 return acct_info.norm_pages;
210}
211
212uint64_t xbzrle_mig_bytes_transferred(void)
213{
214 return acct_info.xbzrle_bytes;
215}
216
217uint64_t xbzrle_mig_pages_transferred(void)
218{
219 return acct_info.xbzrle_pages;
220}
221
222uint64_t xbzrle_mig_pages_cache_miss(void)
223{
224 return acct_info.xbzrle_cache_miss;
225}
226
227double xbzrle_mig_cache_miss_rate(void)
228{
229 return acct_info.xbzrle_cache_miss_rate;
230}
231
232uint64_t xbzrle_mig_pages_overflow(void)
233{
234 return acct_info.xbzrle_overflows;
235}
236
dd631697 237static QemuMutex migration_bitmap_mutex;
56e93d26 238static uint64_t migration_dirty_pages;
56e93d26 239
b8fb8cb7
DDAG
240/* used by the search for pages to send */
241struct PageSearchStatus {
242 /* Current block being searched */
243 RAMBlock *block;
244 /* Current offset to search from */
245 ram_addr_t offset;
246 /* Set once we wrap around */
247 bool complete_round;
248};
249typedef struct PageSearchStatus PageSearchStatus;
250
60be6340
DL
251static struct BitmapRcu {
252 struct rcu_head rcu;
f3f491fc 253 /* Main migration bitmap */
60be6340 254 unsigned long *bmap;
f3f491fc
DDAG
255 /* bitmap of pages that haven't been sent even once
256 * only maintained and used in postcopy at the moment
257 * where it's used to send the dirtymap at the start
258 * of the postcopy phase
259 */
260 unsigned long *unsentmap;
60be6340
DL
261} *migration_bitmap_rcu;
262
56e93d26 263struct CompressParam {
56e93d26 264 bool done;
90e56fb4 265 bool quit;
56e93d26
JQ
266 QEMUFile *file;
267 QemuMutex mutex;
268 QemuCond cond;
269 RAMBlock *block;
270 ram_addr_t offset;
271};
272typedef struct CompressParam CompressParam;
273
274struct DecompressParam {
73a8912b 275 bool done;
90e56fb4 276 bool quit;
56e93d26
JQ
277 QemuMutex mutex;
278 QemuCond cond;
279 void *des;
d341d9f3 280 uint8_t *compbuf;
56e93d26
JQ
281 int len;
282};
283typedef struct DecompressParam DecompressParam;
284
285static CompressParam *comp_param;
286static QemuThread *compress_threads;
287/* comp_done_cond is used to wake up the migration thread when
288 * one of the compression threads has finished the compression.
289 * comp_done_lock is used together with comp_done_cond.
290 */
0d9f9a5c
LL
291static QemuMutex comp_done_lock;
292static QemuCond comp_done_cond;
56e93d26
JQ
293/* The empty QEMUFileOps will be used by file in CompressParam */
294static const QEMUFileOps empty_ops = { };
295
296static bool compression_switch;
56e93d26
JQ
297static DecompressParam *decomp_param;
298static QemuThread *decompress_threads;
73a8912b
LL
299static QemuMutex decomp_done_lock;
300static QemuCond decomp_done_cond;
56e93d26 301
a7a9a88f
LL
302static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
303 ram_addr_t offset);
56e93d26
JQ
304
305static void *do_data_compress(void *opaque)
306{
307 CompressParam *param = opaque;
a7a9a88f
LL
308 RAMBlock *block;
309 ram_addr_t offset;
56e93d26 310
a7a9a88f 311 qemu_mutex_lock(&param->mutex);
90e56fb4 312 while (!param->quit) {
a7a9a88f
LL
313 if (param->block) {
314 block = param->block;
315 offset = param->offset;
316 param->block = NULL;
317 qemu_mutex_unlock(&param->mutex);
318
319 do_compress_ram_page(param->file, block, offset);
320
0d9f9a5c 321 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 322 param->done = true;
0d9f9a5c
LL
323 qemu_cond_signal(&comp_done_cond);
324 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
325
326 qemu_mutex_lock(&param->mutex);
327 } else {
56e93d26
JQ
328 qemu_cond_wait(&param->cond, &param->mutex);
329 }
56e93d26 330 }
a7a9a88f 331 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
332
333 return NULL;
334}
335
336static inline void terminate_compression_threads(void)
337{
338 int idx, thread_count;
339
340 thread_count = migrate_compress_threads();
3d0684b2 341
56e93d26
JQ
342 for (idx = 0; idx < thread_count; idx++) {
343 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 344 comp_param[idx].quit = true;
56e93d26
JQ
345 qemu_cond_signal(&comp_param[idx].cond);
346 qemu_mutex_unlock(&comp_param[idx].mutex);
347 }
348}
349
350void migrate_compress_threads_join(void)
351{
352 int i, thread_count;
353
354 if (!migrate_use_compression()) {
355 return;
356 }
357 terminate_compression_threads();
358 thread_count = migrate_compress_threads();
359 for (i = 0; i < thread_count; i++) {
360 qemu_thread_join(compress_threads + i);
361 qemu_fclose(comp_param[i].file);
362 qemu_mutex_destroy(&comp_param[i].mutex);
363 qemu_cond_destroy(&comp_param[i].cond);
364 }
0d9f9a5c
LL
365 qemu_mutex_destroy(&comp_done_lock);
366 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
367 g_free(compress_threads);
368 g_free(comp_param);
56e93d26
JQ
369 compress_threads = NULL;
370 comp_param = NULL;
56e93d26
JQ
371}
372
373void migrate_compress_threads_create(void)
374{
375 int i, thread_count;
376
377 if (!migrate_use_compression()) {
378 return;
379 }
56e93d26
JQ
380 compression_switch = true;
381 thread_count = migrate_compress_threads();
382 compress_threads = g_new0(QemuThread, thread_count);
383 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
384 qemu_cond_init(&comp_done_cond);
385 qemu_mutex_init(&comp_done_lock);
56e93d26 386 for (i = 0; i < thread_count; i++) {
e110aa91
C
387 /* comp_param[i].file is just used as a dummy buffer to save data,
388 * set its ops to empty.
56e93d26
JQ
389 */
390 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
391 comp_param[i].done = true;
90e56fb4 392 comp_param[i].quit = false;
56e93d26
JQ
393 qemu_mutex_init(&comp_param[i].mutex);
394 qemu_cond_init(&comp_param[i].cond);
395 qemu_thread_create(compress_threads + i, "compress",
396 do_data_compress, comp_param + i,
397 QEMU_THREAD_JOINABLE);
398 }
399}
400
401/**
3d0684b2 402 * save_page_header: write page header to wire
56e93d26
JQ
403 *
404 * If this is the 1st block, it also writes the block identification
405 *
3d0684b2 406 * Returns the number of bytes written
56e93d26
JQ
407 *
408 * @f: QEMUFile where to send the data
409 * @block: block that contains the page we want to send
410 * @offset: offset inside the block for the page
411 * in the lower bits, it contains flags
412 */
413static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
414{
9f5f380b 415 size_t size, len;
56e93d26
JQ
416
417 qemu_put_be64(f, offset);
418 size = 8;
419
420 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
421 len = strlen(block->idstr);
422 qemu_put_byte(f, len);
423 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
424 size += 1 + len;
56e93d26
JQ
425 }
426 return size;
427}
428
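/*
 * Sketch of the wire layout produced by save_page_header():
 *
 *     8 bytes    be64: offset within the RAMBlock, OR'd with RAM_SAVE_FLAG_*
 *     1 byte     strlen(block->idstr)   \  only when RAM_SAVE_FLAG_CONTINUE
 *     len bytes  block->idstr (no NUL)  /  is clear, i.e. a new block starts
 */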
3d0684b2
JQ
429/**
430 * mig_throttle_guest_down: throttle down the guest
431 *
432 * Reduce amount of guest cpu execution to hopefully slow down memory
433 * writes. If guest dirty memory rate is reduced below the rate at
434 * which we can transfer pages to the destination then we should be
435 * able to complete migration. Some workloads dirty memory way too
436 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
437 */
438static void mig_throttle_guest_down(void)
439{
440 MigrationState *s = migrate_get_current();
2594f56d
DB
441 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
442 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
443
444 /* We have not started throttling yet. Let's start it. */
445 if (!cpu_throttle_active()) {
446 cpu_throttle_set(pct_initial);
447 } else {
448 /* Throttling already on, just increase the rate */
449 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
450 }
451}
452
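/*
 * Worked example (assuming, for illustration, cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10): successive calls throttle the guest at 20%,
 * 30%, 40%, ... of its CPU time until migration converges or the throttle
 * saturates.
 */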
3d0684b2
JQ
453/**
454 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
455 *
6f37bb8b 456 * @rs: current RAM state
3d0684b2
JQ
457 * @current_addr: address for the zero page
458 *
459 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
460 * The important thing is that a stale (not-yet-0'd) page be replaced
461 * by the new data.
462 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 463 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 464 */
6f37bb8b 465static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 466{
6f37bb8b 467 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
468 return;
469 }
470
471 /* We don't care if this fails to allocate a new cache page
472 * as long as it updated an old one */
473 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
474 bitmap_sync_count);
475}
476
477#define ENCODING_FLAG_XBZRLE 0x1
478
479/**
480 * save_xbzrle_page: compress and send current page
481 *
482 * Returns: 1 means that we wrote the page
483 * 0 means that page is identical to the one already sent
484 * -1 means that xbzrle would be longer than normal
485 *
486 * @f: QEMUFile where to send the data
3d0684b2
JQ
487 * @current_data: pointer to the address of the page contents
488 * @current_addr: addr of the page
56e93d26
JQ
489 * @block: block that contains the page we want to send
490 * @offset: offset inside the block for the page
491 * @last_stage: if we are at the completion stage
492 * @bytes_transferred: increase it with the number of transferred bytes
493 */
494static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
495 ram_addr_t current_addr, RAMBlock *block,
496 ram_addr_t offset, bool last_stage,
497 uint64_t *bytes_transferred)
498{
499 int encoded_len = 0, bytes_xbzrle;
500 uint8_t *prev_cached_page;
501
502 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
503 acct_info.xbzrle_cache_miss++;
504 if (!last_stage) {
505 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
506 bitmap_sync_count) == -1) {
507 return -1;
508 } else {
509 /* update *current_data when the page has been
510 inserted into cache */
511 *current_data = get_cached_data(XBZRLE.cache, current_addr);
512 }
513 }
514 return -1;
515 }
516
517 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
518
519 /* save current buffer into memory */
520 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
521
522 /* XBZRLE encoding (if there is no overflow) */
523 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
524 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
525 TARGET_PAGE_SIZE);
526 if (encoded_len == 0) {
55c4446b 527 trace_save_xbzrle_page_skipping();
56e93d26
JQ
528 return 0;
529 } else if (encoded_len == -1) {
55c4446b 530 trace_save_xbzrle_page_overflow();
56e93d26
JQ
531 acct_info.xbzrle_overflows++;
532 /* update data in the cache */
533 if (!last_stage) {
534 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
535 *current_data = prev_cached_page;
536 }
537 return -1;
538 }
539
540 /* we need to update the data in the cache, in order to get the same data */
541 if (!last_stage) {
542 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
543 }
544
545 /* Send XBZRLE based compressed page */
546 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
547 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
548 qemu_put_be16(f, encoded_len);
549 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
550 bytes_xbzrle += encoded_len + 1 + 2;
551 acct_info.xbzrle_pages++;
552 acct_info.xbzrle_bytes += bytes_xbzrle;
553 *bytes_transferred += bytes_xbzrle;
554
555 return 1;
556}
557
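/*
 * Sketch of the wire layout of an XBZRLE page as sent above:
 *
 *     page header        with RAM_SAVE_FLAG_XBZRLE set
 *     1 byte             ENCODING_FLAG_XBZRLE
 *     2 bytes            be16 encoded_len
 *     encoded_len bytes  XBZRLE delta against the previously cached page
 *
 * hence the accounting of encoded_len + 1 + 2 on top of the header size.
 */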
3d0684b2
JQ
558/**
559 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 560 *
3d0684b2
JQ
561 * Called with rcu_read_lock() to protect migration_bitmap
562 *
563 * Returns the byte offset within memory region of the start of a dirty page
564 *
6f37bb8b 565 * @rs: current RAM state
3d0684b2
JQ
566 * @rb: RAMBlock where to search for dirty pages
567 * @start: starting address (typically so we can continue from previous page)
568 * @ram_addr_abs: pointer into which to store the address of the dirty page
569 * within the global ram_addr space
f3f491fc 570 */
56e93d26 571static inline
6f37bb8b 572ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
573 ram_addr_t start,
574 ram_addr_t *ram_addr_abs)
56e93d26 575{
2f68e399 576 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 577 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
578 uint64_t rb_size = rb->used_length;
579 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 580 unsigned long *bitmap;
56e93d26
JQ
581
582 unsigned long next;
583
60be6340 584 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
6f37bb8b 585 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
586 next = nr + 1;
587 } else {
2ff64038 588 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
589 }
590
f3f491fc 591 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
592 return (next - base) << TARGET_PAGE_BITS;
593}
594
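/*
 * Worked example with TARGET_PAGE_BITS == 12 (4 KiB target pages): for a
 * RAMBlock at offset 0x40000000 (base == 0x40000) and start == 0x3000, the
 * search begins at bit 0x40003; if that is the first dirty bit found,
 * *ram_addr_abs becomes 0x40003000 and the function returns 0x3000, the
 * offset of the dirty page within the block.
 */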
a82d593b
DDAG
595static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
596{
597 bool ret;
598 int nr = addr >> TARGET_PAGE_BITS;
599 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
600
601 ret = test_and_clear_bit(nr, bitmap);
602
603 if (ret) {
604 migration_dirty_pages--;
605 }
606 return ret;
607}
608
1ffb5dfd 609static int64_t num_dirty_pages_period;
56e93d26
JQ
610static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
611{
2ff64038 612 unsigned long *bitmap;
60be6340 613 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1ffb5dfd
CF
614 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
615 start, length, &num_dirty_pages_period);
56e93d26
JQ
616}
617
56e93d26
JQ
618/* Fix me: there are too many global variables used in migration process. */
619static int64_t start_time;
620static int64_t bytes_xfer_prev;
56e93d26
JQ
621static uint64_t xbzrle_cache_miss_prev;
622static uint64_t iterations_prev;
623
624static void migration_bitmap_sync_init(void)
625{
626 start_time = 0;
627 bytes_xfer_prev = 0;
628 num_dirty_pages_period = 0;
629 xbzrle_cache_miss_prev = 0;
630 iterations_prev = 0;
631}
632
3d0684b2
JQ
633/**
634 * ram_pagesize_summary: calculate all the pagesizes of a VM
635 *
636 * Returns a summary bitmap of the page sizes of all RAMBlocks
637 *
638 * For VMs with just normal pages this is equivalent to the host page
639 * size. If it's got some huge pages then it's the OR of all the
640 * different page sizes.
e8ca1db2
DDAG
641 */
642uint64_t ram_pagesize_summary(void)
643{
644 RAMBlock *block;
645 uint64_t summary = 0;
646
647 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
648 summary |= block->page_size;
649 }
650
651 return summary;
652}
653
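/*
 * Worked example: a guest whose RAMBlocks are all backed by 4 KiB pages
 * except for one 2 MiB hugetlbfs block yields 0x1000 | 0x200000 == 0x201000.
 */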
56e93d26
JQ
654static void migration_bitmap_sync(void)
655{
656 RAMBlock *block;
56e93d26
JQ
657 MigrationState *s = migrate_get_current();
658 int64_t end_time;
659 int64_t bytes_xfer_now;
660
661 bitmap_sync_count++;
662
663 if (!bytes_xfer_prev) {
664 bytes_xfer_prev = ram_bytes_transferred();
665 }
666
667 if (!start_time) {
668 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
669 }
670
671 trace_migration_bitmap_sync_start();
9c1f8f44 672 memory_global_dirty_log_sync();
56e93d26 673
dd631697 674 qemu_mutex_lock(&migration_bitmap_mutex);
56e93d26
JQ
675 rcu_read_lock();
676 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2f68e399 677 migration_bitmap_sync_range(block->offset, block->used_length);
56e93d26
JQ
678 }
679 rcu_read_unlock();
dd631697 680 qemu_mutex_unlock(&migration_bitmap_mutex);
56e93d26 681
1ffb5dfd
CF
682 trace_migration_bitmap_sync_end(num_dirty_pages_period);
683
56e93d26
JQ
684 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
685
686 /* more than 1 second = 1000 milliseconds */
687 if (end_time > start_time + 1000) {
688 if (migrate_auto_converge()) {
689 /* The following detection logic can be refined later. For now:
690 Check to see if the dirtied bytes are 50% more than the approx.
691 amount of bytes that just got transferred since the last time we
070afca2
JH
692 were in this routine. If that happens twice, start or increase
693 throttling */
56e93d26 694 bytes_xfer_now = ram_bytes_transferred();
070afca2 695
56e93d26
JQ
696 if (s->dirty_pages_rate &&
697 (num_dirty_pages_period * TARGET_PAGE_SIZE >
698 (bytes_xfer_now - bytes_xfer_prev)/2) &&
070afca2 699 (dirty_rate_high_cnt++ >= 2)) {
56e93d26 700 trace_migration_throttle();
56e93d26 701 dirty_rate_high_cnt = 0;
070afca2 702 mig_throttle_guest_down();
56e93d26
JQ
703 }
704 bytes_xfer_prev = bytes_xfer_now;
56e93d26 705 }
070afca2 706
56e93d26
JQ
707 if (migrate_use_xbzrle()) {
708 if (iterations_prev != acct_info.iterations) {
709 acct_info.xbzrle_cache_miss_rate =
710 (double)(acct_info.xbzrle_cache_miss -
711 xbzrle_cache_miss_prev) /
712 (acct_info.iterations - iterations_prev);
713 }
714 iterations_prev = acct_info.iterations;
715 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
716 }
717 s->dirty_pages_rate = num_dirty_pages_period * 1000
718 / (end_time - start_time);
719 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
720 start_time = end_time;
721 num_dirty_pages_period = 0;
722 }
723 s->dirty_sync_count = bitmap_sync_count;
4addcd4f
DDAG
724 if (migrate_use_events()) {
725 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
726 }
56e93d26
JQ
727}
728
729/**
3d0684b2 730 * save_zero_page: send the zero page to the stream
56e93d26 731 *
3d0684b2 732 * Returns the number of pages written.
56e93d26
JQ
733 *
734 * @f: QEMUFile where to send the data
735 * @block: block that contains the page we want to send
736 * @offset: offset inside the block for the page
737 * @p: pointer to the page
738 * @bytes_transferred: increase it with the number of transferred bytes
739 */
740static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
741 uint8_t *p, uint64_t *bytes_transferred)
742{
743 int pages = -1;
744
745 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
746 acct_info.dup_pages++;
747 *bytes_transferred += save_page_header(f, block,
748 offset | RAM_SAVE_FLAG_COMPRESS);
749 qemu_put_byte(f, 0);
750 *bytes_transferred += 1;
751 pages = 1;
752 }
753
754 return pages;
755}
756
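/*
 * On the wire a zero page is therefore just the page header with
 * RAM_SAVE_FLAG_COMPRESS set, followed by a single byte holding the fill
 * value (always 0 here); no zlib compression is involved despite the flag
 * name.
 */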
36449157 757static void ram_release_pages(MigrationState *ms, const char *rbname,
53f09a10
PB
758 uint64_t offset, int pages)
759{
760 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
761 return;
762 }
763
36449157 764 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
765}
766
56e93d26 767/**
3d0684b2 768 * ram_save_page: send the given page to the stream
56e93d26 769 *
3d0684b2 770 * Returns the number of pages written.
3fd3c4b3
DDAG
771 * < 0 - error
772 * >=0 - Number of pages written - this might legally be 0
773 * if xbzrle noticed the page was the same.
56e93d26 774 *
6f37bb8b 775 * @rs: current RAM state
3d0684b2 776 * @ms: current migration state
56e93d26
JQ
777 * @f: QEMUFile where to send the data
778 * @block: block that contains the page we want to send
779 * @offset: offset inside the block for the page
780 * @last_stage: if we are at the completion stage
781 * @bytes_transferred: increase it with the number of transferred bytes
782 */
6f37bb8b
JQ
783static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
784 PageSearchStatus *pss, bool last_stage,
785 uint64_t *bytes_transferred)
56e93d26
JQ
786{
787 int pages = -1;
788 uint64_t bytes_xmit;
789 ram_addr_t current_addr;
56e93d26
JQ
790 uint8_t *p;
791 int ret;
792 bool send_async = true;
a08f6890
HZ
793 RAMBlock *block = pss->block;
794 ram_addr_t offset = pss->offset;
56e93d26 795
2f68e399 796 p = block->host + offset;
56e93d26
JQ
797
798 /* When in doubt, send the page as a normal page */
799 bytes_xmit = 0;
800 ret = ram_control_save_page(f, block->offset,
801 offset, TARGET_PAGE_SIZE, &bytes_xmit);
802 if (bytes_xmit) {
803 *bytes_transferred += bytes_xmit;
804 pages = 1;
805 }
806
807 XBZRLE_cache_lock();
808
809 current_addr = block->offset + offset;
810
6f37bb8b 811 if (block == rs->last_sent_block) {
56e93d26
JQ
812 offset |= RAM_SAVE_FLAG_CONTINUE;
813 }
814 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
815 if (ret != RAM_SAVE_CONTROL_DELAYED) {
816 if (bytes_xmit > 0) {
817 acct_info.norm_pages++;
818 } else if (bytes_xmit == 0) {
819 acct_info.dup_pages++;
820 }
821 }
822 } else {
823 pages = save_zero_page(f, block, offset, p, bytes_transferred);
824 if (pages > 0) {
825 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
826 * page would be stale
827 */
6f37bb8b 828 xbzrle_cache_zero_page(rs, current_addr);
53f09a10 829 ram_release_pages(ms, block->idstr, pss->offset, pages);
6f37bb8b 830 } else if (!rs->ram_bulk_stage &&
9eb14766 831 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
56e93d26
JQ
832 pages = save_xbzrle_page(f, &p, current_addr, block,
833 offset, last_stage, bytes_transferred);
834 if (!last_stage) {
835 /* Can't send this cached data async, since the cache page
836 * might get updated before it gets to the wire
837 */
838 send_async = false;
839 }
840 }
841 }
842
843 /* XBZRLE overflow or normal page */
844 if (pages == -1) {
845 *bytes_transferred += save_page_header(f, block,
846 offset | RAM_SAVE_FLAG_PAGE);
847 if (send_async) {
53f09a10
PB
848 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
849 migrate_release_ram() &
850 migration_in_postcopy(ms));
56e93d26
JQ
851 } else {
852 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
853 }
854 *bytes_transferred += TARGET_PAGE_SIZE;
855 pages = 1;
856 acct_info.norm_pages++;
857 }
858
859 XBZRLE_cache_unlock();
860
861 return pages;
862}
863
a7a9a88f
LL
864static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
865 ram_addr_t offset)
56e93d26
JQ
866{
867 int bytes_sent, blen;
a7a9a88f 868 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 869
a7a9a88f 870 bytes_sent = save_page_header(f, block, offset |
56e93d26 871 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 872 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 873 migrate_compress_level());
b3be2896
LL
874 if (blen < 0) {
875 bytes_sent = 0;
876 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
877 error_report("compressed data failed!");
878 } else {
879 bytes_sent += blen;
53f09a10
PB
880 ram_release_pages(migrate_get_current(), block->idstr,
881 offset & TARGET_PAGE_MASK, 1);
b3be2896 882 }
56e93d26
JQ
883
884 return bytes_sent;
885}
886
56e93d26
JQ
887static uint64_t bytes_transferred;
888
889static void flush_compressed_data(QEMUFile *f)
890{
891 int idx, len, thread_count;
892
893 if (!migrate_use_compression()) {
894 return;
895 }
896 thread_count = migrate_compress_threads();
a7a9a88f 897
0d9f9a5c 898 qemu_mutex_lock(&comp_done_lock);
56e93d26 899 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 900 while (!comp_param[idx].done) {
0d9f9a5c 901 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 902 }
a7a9a88f 903 }
0d9f9a5c 904 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
905
906 for (idx = 0; idx < thread_count; idx++) {
907 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 908 if (!comp_param[idx].quit) {
56e93d26
JQ
909 len = qemu_put_qemu_file(f, comp_param[idx].file);
910 bytes_transferred += len;
911 }
a7a9a88f 912 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
913 }
914}
915
916static inline void set_compress_params(CompressParam *param, RAMBlock *block,
917 ram_addr_t offset)
918{
919 param->block = block;
920 param->offset = offset;
921}
922
923static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
924 ram_addr_t offset,
925 uint64_t *bytes_transferred)
926{
927 int idx, thread_count, bytes_xmit = -1, pages = -1;
928
929 thread_count = migrate_compress_threads();
0d9f9a5c 930 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
931 while (true) {
932 for (idx = 0; idx < thread_count; idx++) {
933 if (comp_param[idx].done) {
a7a9a88f 934 comp_param[idx].done = false;
56e93d26 935 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 936 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 937 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
938 qemu_cond_signal(&comp_param[idx].cond);
939 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
940 pages = 1;
941 acct_info.norm_pages++;
942 *bytes_transferred += bytes_xmit;
943 break;
944 }
945 }
946 if (pages > 0) {
947 break;
948 } else {
0d9f9a5c 949 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
950 }
951 }
0d9f9a5c 952 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
953
954 return pages;
955}
956
957/**
958 * ram_save_compressed_page: compress the given page and send it to the stream
959 *
3d0684b2 960 * Returns the number of pages written.
56e93d26 961 *
6f37bb8b 962 * @rs: current RAM state
3d0684b2 963 * @ms: current migration state
56e93d26
JQ
964 * @f: QEMUFile where to send the data
965 * @block: block that contains the page we want to send
966 * @offset: offset inside the block for the page
967 * @last_stage: if we are at the completion stage
968 * @bytes_transferred: increase it with the number of transferred bytes
969 */
6f37bb8b
JQ
970static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
971 QEMUFile *f,
9eb14766 972 PageSearchStatus *pss, bool last_stage,
56e93d26
JQ
973 uint64_t *bytes_transferred)
974{
975 int pages = -1;
fc50438e 976 uint64_t bytes_xmit = 0;
56e93d26 977 uint8_t *p;
fc50438e 978 int ret, blen;
a08f6890
HZ
979 RAMBlock *block = pss->block;
980 ram_addr_t offset = pss->offset;
56e93d26 981
2f68e399 982 p = block->host + offset;
56e93d26 983
56e93d26
JQ
984 ret = ram_control_save_page(f, block->offset,
985 offset, TARGET_PAGE_SIZE, &bytes_xmit);
986 if (bytes_xmit) {
987 *bytes_transferred += bytes_xmit;
988 pages = 1;
989 }
56e93d26
JQ
990 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
991 if (ret != RAM_SAVE_CONTROL_DELAYED) {
992 if (bytes_xmit > 0) {
993 acct_info.norm_pages++;
994 } else if (bytes_xmit == 0) {
995 acct_info.dup_pages++;
996 }
997 }
998 } else {
999 /* When starting the process of a new block, the first page of
1000 * the block should be sent out before other pages in the same
1001 * block, and all the pages in the last block should have been sent
1002 * out.  Keeping this order is important, because the 'cont' flag
1003 * is used to avoid resending the block name.
1004 */
6f37bb8b 1005 if (block != rs->last_sent_block) {
56e93d26
JQ
1006 flush_compressed_data(f);
1007 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1008 if (pages == -1) {
fc50438e
LL
1009 /* Make sure the first page is sent out before other pages */
1010 bytes_xmit = save_page_header(f, block, offset |
1011 RAM_SAVE_FLAG_COMPRESS_PAGE);
1012 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1013 migrate_compress_level());
1014 if (blen > 0) {
1015 *bytes_transferred += bytes_xmit + blen;
b3be2896 1016 acct_info.norm_pages++;
b3be2896 1017 pages = 1;
fc50438e
LL
1018 } else {
1019 qemu_file_set_error(f, blen);
1020 error_report("compressed data failed!");
b3be2896 1021 }
56e93d26 1022 }
53f09a10
PB
1023 if (pages > 0) {
1024 ram_release_pages(ms, block->idstr, pss->offset, pages);
1025 }
56e93d26 1026 } else {
fc50438e 1027 offset |= RAM_SAVE_FLAG_CONTINUE;
56e93d26
JQ
1028 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1029 if (pages == -1) {
1030 pages = compress_page_with_multi_thread(f, block, offset,
1031 bytes_transferred);
53f09a10
PB
1032 } else {
1033 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1034 }
1035 }
1036 }
1037
1038 return pages;
1039}
1040
3d0684b2
JQ
1041/**
1042 * find_dirty_block: find the next dirty page and update any state
1043 * associated with the search process.
b9e60928 1044 *
3d0684b2 1045 * Returns whether a page is found
b9e60928 1046 *
6f37bb8b 1047 * @rs: current RAM state
3d0684b2
JQ
1048 * @f: QEMUFile where to send the data
1049 * @pss: data about the state of the current dirty page scan
1050 * @again: set to false if the search has scanned the whole of RAM
1051 * @ram_addr_abs: pointer into which to store the address of the dirty page
1052 * within the global ram_addr space
b9e60928 1053 */
6f37bb8b 1054static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1055 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1056{
6f37bb8b 1057 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1058 ram_addr_abs);
6f37bb8b
JQ
1059 if (pss->complete_round && pss->block == rs->last_seen_block &&
1060 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1061 /*
1062 * We've been once around the RAM and haven't found anything.
1063 * Give up.
1064 */
1065 *again = false;
1066 return false;
1067 }
1068 if (pss->offset >= pss->block->used_length) {
1069 /* Didn't find anything in this RAM Block */
1070 pss->offset = 0;
1071 pss->block = QLIST_NEXT_RCU(pss->block, next);
1072 if (!pss->block) {
1073 /* Hit the end of the list */
1074 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1075 /* Flag that we've looped */
1076 pss->complete_round = true;
6f37bb8b 1077 rs->ram_bulk_stage = false;
b9e60928
DDAG
1078 if (migrate_use_xbzrle()) {
1079 /* If xbzrle is on, stop using the data compression at this
1080 * point. In theory, xbzrle can do better than compression.
1081 */
1082 flush_compressed_data(f);
1083 compression_switch = false;
1084 }
1085 }
1086 /* Didn't find anything this time, but try again on the new block */
1087 *again = true;
1088 return false;
1089 } else {
1090 /* Can go around again, but... */
1091 *again = true;
1092 /* We've found something so probably don't need to */
1093 return true;
1094 }
1095}
1096
3d0684b2
JQ
1097/**
1098 * unqueue_page: gets a page of the queue
1099 *
a82d593b 1100 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1101 *
3d0684b2
JQ
1102 * Returns the block of the page (or NULL if none available)
1103 *
1104 * @ms: current migration state
1105 * @offset: used to return the offset within the RAMBlock
1106 * @ram_addr_abs: pointer into which to store the address of the dirty page
1107 * within the global ram_addr space
a82d593b
DDAG
1108 */
1109static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1110 ram_addr_t *ram_addr_abs)
1111{
1112 RAMBlock *block = NULL;
1113
1114 qemu_mutex_lock(&ms->src_page_req_mutex);
1115 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1116 struct MigrationSrcPageRequest *entry =
1117 QSIMPLEQ_FIRST(&ms->src_page_requests);
1118 block = entry->rb;
1119 *offset = entry->offset;
1120 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1121 TARGET_PAGE_MASK;
1122
1123 if (entry->len > TARGET_PAGE_SIZE) {
1124 entry->len -= TARGET_PAGE_SIZE;
1125 entry->offset += TARGET_PAGE_SIZE;
1126 } else {
1127 memory_region_unref(block->mr);
1128 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1129 g_free(entry);
1130 }
1131 }
1132 qemu_mutex_unlock(&ms->src_page_req_mutex);
1133
1134 return block;
1135}
1136
3d0684b2
JQ
1137/**
1138 * get_queued_page: unqueue a page from the postcopy requests
1139 *
1140 * Skips pages that are already sent (!dirty)
a82d593b 1141 *
3d0684b2 1142 * Returns whether a queued page is found
a82d593b 1143 *
6f37bb8b 1144 * @rs: current RAM state
3d0684b2
JQ
1145 * @ms: current migration state
1146 * @pss: data about the state of the current dirty page scan
1147 * @ram_addr_abs: pointer into which to store the address of the dirty page
1148 * within the global ram_addr space
a82d593b 1149 */
6f37bb8b
JQ
1150static bool get_queued_page(RAMState *rs, MigrationState *ms,
1151 PageSearchStatus *pss,
a82d593b
DDAG
1152 ram_addr_t *ram_addr_abs)
1153{
1154 RAMBlock *block;
1155 ram_addr_t offset;
1156 bool dirty;
1157
1158 do {
1159 block = unqueue_page(ms, &offset, ram_addr_abs);
1160 /*
1161 * We're sending this page, and since it's postcopy nothing else
1162 * will dirty it, and we must make sure it doesn't get sent again
1163 * even if this queue request was received after the background
1164 * search already sent it.
1165 */
1166 if (block) {
1167 unsigned long *bitmap;
1168 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1169 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1170 if (!dirty) {
1171 trace_get_queued_page_not_dirty(
1172 block->idstr, (uint64_t)offset,
1173 (uint64_t)*ram_addr_abs,
1174 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1175 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1176 } else {
1177 trace_get_queued_page(block->idstr,
1178 (uint64_t)offset,
1179 (uint64_t)*ram_addr_abs);
1180 }
1181 }
1182
1183 } while (block && !dirty);
1184
1185 if (block) {
1186 /*
1187 * As soon as we start servicing pages out of order, then we have
1188 * to kill the bulk stage, since the bulk stage assumes
1189 * in (migration_bitmap_find_and_reset_dirty) that every page is
1190 * dirty, that's no longer true.
1191 */
6f37bb8b 1192 rs->ram_bulk_stage = false;
a82d593b
DDAG
1193
1194 /*
1195 * We want the background search to continue from the queued page
1196 * since the guest is likely to want other pages near to the page
1197 * it just requested.
1198 */
1199 pss->block = block;
1200 pss->offset = offset;
1201 }
1202
1203 return !!block;
1204}
1205
6c595cde 1206/**
5e58f968
JQ
1207 * migration_page_queue_free: drop any remaining pages in the ram
1208 * request queue
6c595cde 1209 *
3d0684b2
JQ
1210 * It should be empty at the end anyway, but in error cases there may
1211 * be some left.  If any pages are left, we drop them.
1212 *
1213 * @ms: current migration state
6c595cde 1214 */
5e58f968 1215void migration_page_queue_free(MigrationState *ms)
6c595cde
DDAG
1216{
1217 struct MigrationSrcPageRequest *mspr, *next_mspr;
1218 /* This queue generally should be empty - but in the case of a failed
1219 * migration might have some droppings in.
1220 */
1221 rcu_read_lock();
1222 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1223 memory_region_unref(mspr->rb->mr);
1224 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1225 g_free(mspr);
1226 }
1227 rcu_read_unlock();
1228}
1229
1230/**
3d0684b2
JQ
1231 * ram_save_queue_pages: queue the page for transmission
1232 *
1233 * A request from postcopy destination for example.
1234 *
1235 * Returns zero on success or negative on error
1236 *
1237 * @ms: current migration state
1238 * @rbname: Name of the RAMBLock of the request. NULL means the
1239 * same that last one.
1240 * @start: starting address from the start of the RAMBlock
1241 * @len: length (in bytes) to send
6c595cde
DDAG
1242 */
1243int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1244 ram_addr_t start, ram_addr_t len)
1245{
1246 RAMBlock *ramblock;
1247
d3bf5418 1248 ms->postcopy_requests++;
6c595cde
DDAG
1249 rcu_read_lock();
1250 if (!rbname) {
1251 /* Reuse last RAMBlock */
1252 ramblock = ms->last_req_rb;
1253
1254 if (!ramblock) {
1255 /*
1256 * Shouldn't happen, we can't reuse the last RAMBlock if
1257 * it's the 1st request.
1258 */
1259 error_report("ram_save_queue_pages no previous block");
1260 goto err;
1261 }
1262 } else {
1263 ramblock = qemu_ram_block_by_name(rbname);
1264
1265 if (!ramblock) {
1266 /* We shouldn't be asked for a non-existent RAMBlock */
1267 error_report("ram_save_queue_pages no block '%s'", rbname);
1268 goto err;
1269 }
1270 ms->last_req_rb = ramblock;
1271 }
1272 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1273 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1274 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1275 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1276 __func__, start, len, ramblock->used_length);
1277 goto err;
1278 }
1279
1280 struct MigrationSrcPageRequest *new_entry =
1281 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1282 new_entry->rb = ramblock;
1283 new_entry->offset = start;
1284 new_entry->len = len;
1285
1286 memory_region_ref(ramblock->mr);
1287 qemu_mutex_lock(&ms->src_page_req_mutex);
1288 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1289 qemu_mutex_unlock(&ms->src_page_req_mutex);
1290 rcu_read_unlock();
1291
1292 return 0;
1293
1294err:
1295 rcu_read_unlock();
1296 return -1;
1297}
1298
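/*
 * Illustrative flow: a postcopy destination fault on, say, a 2 MiB region of
 * the "pc.ram" block arrives here as rbname == "pc.ram", start and
 * len == 0x200000; the queued entry is later consumed one TARGET_PAGE_SIZE
 * chunk at a time by unqueue_page() on the migration thread.
 */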
a82d593b 1299/**
3d0684b2 1300 * ram_save_target_page: save one target page
a82d593b 1301 *
3d0684b2 1302 * Returns the number of pages written
a82d593b 1303 *
6f37bb8b 1304 * @rs: current RAM state
3d0684b2 1305 * @ms: current migration state
a82d593b 1306 * @f: QEMUFile where to send the data
3d0684b2 1307 * @pss: data about the page we want to send
a82d593b
DDAG
1308 * @last_stage: if we are at the completion stage
1309 * @bytes_transferred: increase it with the number of transferred bytes
3d0684b2 1310 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1311 */
6f37bb8b 1312static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890 1313 PageSearchStatus *pss,
a82d593b
DDAG
1314 bool last_stage,
1315 uint64_t *bytes_transferred,
1316 ram_addr_t dirty_ram_abs)
1317{
1318 int res = 0;
1319
1320 /* Check if the page is dirty and, if so, send it */
1321 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1322 unsigned long *unsentmap;
1323 if (compression_switch && migrate_use_compression()) {
6f37bb8b 1324 res = ram_save_compressed_page(rs, ms, f, pss,
a82d593b
DDAG
1325 last_stage,
1326 bytes_transferred);
1327 } else {
6f37bb8b 1328 res = ram_save_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1329 bytes_transferred);
1330 }
1331
1332 if (res < 0) {
1333 return res;
1334 }
1335 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1336 if (unsentmap) {
1337 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1338 }
3fd3c4b3
DDAG
1339 /* Only update last_sent_block if a block was actually sent; xbzrle
1340 * might have decided the page was identical so didn't bother writing
1341 * to the stream.
1342 */
1343 if (res > 0) {
6f37bb8b 1344 rs->last_sent_block = pss->block;
3fd3c4b3 1345 }
a82d593b
DDAG
1346 }
1347
1348 return res;
1349}
1350
1351/**
3d0684b2 1352 * ram_save_host_page: save a whole host page
a82d593b 1353 *
3d0684b2
JQ
1354 * Starting at *offset send pages up to the end of the current host
1355 * page. It's valid for the initial offset to point into the middle of
1356 * a host page in which case the remainder of the hostpage is sent.
1357 * Only dirty target pages are sent. Note that the host page size may
1358 * be a huge page for this block.
a82d593b 1359 *
3d0684b2
JQ
1360 * Returns the number of pages written or negative on error
1361 *
6f37bb8b 1362 * @rs: current RAM state
3d0684b2 1363 * @ms: current migration state
a82d593b 1364 * @f: QEMUFile where to send the data
3d0684b2 1365 * @pss: data about the page we want to send
a82d593b
DDAG
1366 * @last_stage: if we are at the completion stage
1367 * @bytes_transferred: increase it with the number of transferred bytes
1368 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1369 */
6f37bb8b 1370static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890
HZ
1371 PageSearchStatus *pss,
1372 bool last_stage,
a82d593b
DDAG
1373 uint64_t *bytes_transferred,
1374 ram_addr_t dirty_ram_abs)
1375{
1376 int tmppages, pages = 0;
4c011c37
DDAG
1377 size_t pagesize = qemu_ram_pagesize(pss->block);
1378
a82d593b 1379 do {
6f37bb8b 1380 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1381 bytes_transferred, dirty_ram_abs);
1382 if (tmppages < 0) {
1383 return tmppages;
1384 }
1385
1386 pages += tmppages;
a08f6890 1387 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1388 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1389 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1390
1391 /* The offset we leave with is the last one we looked at */
a08f6890 1392 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1393 return pages;
1394}
6c595cde 1395
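/*
 * Worked example: for a 2 MiB hugepage RAMBlock with 4 KiB target pages, a
 * single call walks up to 512 target pages of one host page; pss->offset is
 * then stepped back one target page so that, as noted above, it points at
 * the last offset looked at.
 */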
56e93d26 1396/**
3d0684b2 1397 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1398 *
1399 * Called within an RCU critical section.
1400 *
3d0684b2 1401 * Returns the number of pages written where zero means no dirty pages
56e93d26 1402 *
6f37bb8b 1403 * @rs: current RAM state
56e93d26
JQ
1404 * @f: QEMUFile where to send the data
1405 * @last_stage: if we are at the completion stage
1406 * @bytes_transferred: increase it with the number of transferred bytes
a82d593b
DDAG
1407 *
1408 * On systems where host-page-size > target-page-size it will send all the
1409 * pages in a host page that are dirty.
56e93d26
JQ
1410 */
1411
6f37bb8b 1412static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
56e93d26
JQ
1413 uint64_t *bytes_transferred)
1414{
b8fb8cb7 1415 PageSearchStatus pss;
a82d593b 1416 MigrationState *ms = migrate_get_current();
56e93d26 1417 int pages = 0;
b9e60928 1418 bool again, found;
f3f491fc
DDAG
1419 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1420 ram_addr_t space */
56e93d26 1421
0827b9e9
AA
1422 /* No dirty page as there is zero RAM */
1423 if (!ram_bytes_total()) {
1424 return pages;
1425 }
1426
6f37bb8b
JQ
1427 pss.block = rs->last_seen_block;
1428 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1429 pss.complete_round = false;
1430
1431 if (!pss.block) {
1432 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1433 }
56e93d26 1434
b9e60928 1435 do {
a82d593b 1436 again = true;
6f37bb8b 1437 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
b9e60928 1438
a82d593b
DDAG
1439 if (!found) {
1440 /* priority queue empty, so just search for something dirty */
6f37bb8b 1441 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
a82d593b 1442 }
f3f491fc 1443
a82d593b 1444 if (found) {
6f37bb8b 1445 pages = ram_save_host_page(rs, ms, f, &pss,
a82d593b
DDAG
1446 last_stage, bytes_transferred,
1447 dirty_ram_abs);
56e93d26 1448 }
b9e60928 1449 } while (!pages && again);
56e93d26 1450
6f37bb8b
JQ
1451 rs->last_seen_block = pss.block;
1452 rs->last_offset = pss.offset;
56e93d26
JQ
1453
1454 return pages;
1455}
1456
1457void acct_update_position(QEMUFile *f, size_t size, bool zero)
1458{
1459 uint64_t pages = size / TARGET_PAGE_SIZE;
1460 if (zero) {
1461 acct_info.dup_pages += pages;
1462 } else {
1463 acct_info.norm_pages += pages;
1464 bytes_transferred += size;
1465 qemu_update_position(f, size);
1466 }
1467}
1468
1469static ram_addr_t ram_save_remaining(void)
1470{
1471 return migration_dirty_pages;
1472}
1473
1474uint64_t ram_bytes_remaining(void)
1475{
1476 return ram_save_remaining() * TARGET_PAGE_SIZE;
1477}
1478
1479uint64_t ram_bytes_transferred(void)
1480{
1481 return bytes_transferred;
1482}
1483
1484uint64_t ram_bytes_total(void)
1485{
1486 RAMBlock *block;
1487 uint64_t total = 0;
1488
1489 rcu_read_lock();
1490 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1491 total += block->used_length;
1492 rcu_read_unlock();
1493 return total;
1494}
1495
1496void free_xbzrle_decoded_buf(void)
1497{
1498 g_free(xbzrle_decoded_buf);
1499 xbzrle_decoded_buf = NULL;
1500}
1501
60be6340
DL
1502static void migration_bitmap_free(struct BitmapRcu *bmap)
1503{
1504 g_free(bmap->bmap);
f3f491fc 1505 g_free(bmap->unsentmap);
60be6340
DL
1506 g_free(bmap);
1507}
1508
6ad2a215 1509static void ram_migration_cleanup(void *opaque)
56e93d26 1510{
2ff64038
LZ
1511 /* the caller must hold the iothread lock or be in a bh, so there is
1512 * no writing race against this migration_bitmap
1513 */
60be6340
DL
1514 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1515 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1516 if (bitmap) {
56e93d26 1517 memory_global_dirty_log_stop();
60be6340 1518 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1519 }
1520
1521 XBZRLE_cache_lock();
1522 if (XBZRLE.cache) {
1523 cache_fini(XBZRLE.cache);
1524 g_free(XBZRLE.encoded_buf);
1525 g_free(XBZRLE.current_buf);
adb65dec 1526 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1527 XBZRLE.cache = NULL;
1528 XBZRLE.encoded_buf = NULL;
1529 XBZRLE.current_buf = NULL;
1530 }
1531 XBZRLE_cache_unlock();
1532}
1533
6f37bb8b 1534static void ram_state_reset(RAMState *rs)
56e93d26 1535{
6f37bb8b
JQ
1536 rs->last_seen_block = NULL;
1537 rs->last_sent_block = NULL;
1538 rs->last_offset = 0;
1539 rs->last_version = ram_list.version;
1540 rs->ram_bulk_stage = true;
56e93d26
JQ
1541}
1542
1543#define MAX_WAIT 50 /* ms, half buffered_file limit */
1544
dd631697
LZ
1545void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1546{
1547 /* called in qemu main thread, so there is
1548 * no writing race against this migration_bitmap
1549 */
60be6340
DL
1550 if (migration_bitmap_rcu) {
1551 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1552 bitmap = g_new(struct BitmapRcu, 1);
1553 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1554
1555 /* prevent migration_bitmap bits from being set
1556 * by migration_bitmap_sync_range() at the same time.
1557 * It is safe for migration if a migration_bitmap bit is cleared
1558 * at the same time.
1559 */
1560 qemu_mutex_lock(&migration_bitmap_mutex);
60be6340
DL
1561 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1562 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1563
1564 /* We don't have a way to safely extend the sentmap
1565 * with RCU; so mark it as missing, entry to postcopy
1566 * will fail.
1567 */
1568 bitmap->unsentmap = NULL;
1569
60be6340 1570 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
dd631697
LZ
1571 qemu_mutex_unlock(&migration_bitmap_mutex);
1572 migration_dirty_pages += new - old;
60be6340 1573 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1574 }
1575}
56e93d26 1576
4f2e4252
DDAG
1577/*
1578 * 'expected' is the value you expect the bitmap mostly to be full
1579 * of; it won't bother printing lines that are all this value.
1580 * If 'todump' is null the migration bitmap is dumped.
1581 */
1582void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1583{
1584 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1585
1586 int64_t cur;
1587 int64_t linelen = 128;
1588 char linebuf[129];
1589
1590 if (!todump) {
1591 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1592 }
1593
1594 for (cur = 0; cur < ram_pages; cur += linelen) {
1595 int64_t curb;
1596 bool found = false;
1597 /*
1598 * Last line; catch the case where the line length
1599 * is longer than remaining ram
1600 */
1601 if (cur + linelen > ram_pages) {
1602 linelen = ram_pages - cur;
1603 }
1604 for (curb = 0; curb < linelen; curb++) {
1605 bool thisbit = test_bit(cur + curb, todump);
1606 linebuf[curb] = thisbit ? '1' : '.';
1607 found = found || (thisbit != expected);
1608 }
1609 if (found) {
1610 linebuf[curb] = '\0';
1611 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1612 }
1613 }
1614}
1615
e0b266f0
DDAG
1616/* **** functions for postcopy ***** */
1617
ced1c616
PB
1618void ram_postcopy_migrated_memory_release(MigrationState *ms)
1619{
1620 struct RAMBlock *block;
1621 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1622
1623 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1624 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1625 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1626 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1627
1628 while (run_start < range) {
1629 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1630 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1631 (run_end - run_start) << TARGET_PAGE_BITS);
1632 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1633 }
1634 }
1635}
1636
3d0684b2
JQ
1637/**
1638 * postcopy_send_discard_bm_ram: discard a RAMBlock
1639 *
1640 * Returns zero on success
1641 *
e0b266f0
DDAG
1642 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1643 * Note: At this point the 'unsentmap' is the processed bitmap combined
1644 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1645 *
1646 * @ms: current migration state
1647 * @pds: state for postcopy
1648 * @start: RAMBlock starting page
1649 * @length: RAMBlock size
e0b266f0
DDAG
1650 */
1651static int postcopy_send_discard_bm_ram(MigrationState *ms,
1652 PostcopyDiscardState *pds,
1653 unsigned long start,
1654 unsigned long length)
1655{
1656 unsigned long end = start + length; /* one after the end */
1657 unsigned long current;
1658 unsigned long *unsentmap;
1659
1660 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1661 for (current = start; current < end; ) {
1662 unsigned long one = find_next_bit(unsentmap, end, current);
1663
1664 if (one <= end) {
1665 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1666 unsigned long discard_length;
1667
1668 if (zero >= end) {
1669 discard_length = end - one;
1670 } else {
1671 discard_length = zero - one;
1672 }
d688c62d
DDAG
1673 if (discard_length) {
1674 postcopy_discard_send_range(ms, pds, one, discard_length);
1675 }
e0b266f0
DDAG
1676 current = one + discard_length;
1677 } else {
1678 current = one;
1679 }
1680 }
1681
1682 return 0;
1683}
1684
3d0684b2
JQ
1685/**
1686 * postcopy_each_ram_send_discard: discard all RAMBlocks
1687 *
1688 * Returns 0 for success or negative for error
1689 *
e0b266f0
DDAG
1690 * Utility for the outgoing postcopy code.
1691 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1692 * passing it bitmap indexes and name.
e0b266f0
DDAG
1693 * (qemu_ram_foreach_block ends up passing unscaled lengths
1694 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1695 *
1696 * @ms: current migration state
e0b266f0
DDAG
1697 */
1698static int postcopy_each_ram_send_discard(MigrationState *ms)
1699{
1700 struct RAMBlock *block;
1701 int ret;
1702
1703 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1704 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1705 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1706 first,
1707 block->idstr);
1708
1709 /*
1710 * Postcopy sends chunks of bitmap over the wire, but it
1711 * just needs indexes at this point, avoids it having
1712 * target page specific code.
1713 */
1714 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1715 block->used_length >> TARGET_PAGE_BITS);
1716 postcopy_discard_send_finish(ms, pds);
1717 if (ret) {
1718 return ret;
1719 }
1720 }
1721
1722 return 0;
1723}
1724
3d0684b2
JQ
1725/**
1726 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1727 *
1728 * Helper for postcopy_chunk_hostpages; it's called twice to
1729 * canonicalize the two bitmaps, which are similar but one is
1730 * inverted.
99e314eb 1731 *
3d0684b2
JQ
1732 * Postcopy requires that all target pages in a hostpage are dirty or
1733 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1734 *
3d0684b2
JQ
1735 * @ms: current migration state
1736 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1737 * otherwise we need to canonicalize partially dirty host pages
1738 * @block: block that contains the page we want to canonicalize
1739 * @pds: state for postcopy
99e314eb
DDAG
1740 */
1741static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1742 RAMBlock *block,
1743 PostcopyDiscardState *pds)
1744{
1745 unsigned long *bitmap;
1746 unsigned long *unsentmap;
29c59172 1747 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1748 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1749 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1750 unsigned long last = first + (len - 1);
1751 unsigned long run_start;
1752
29c59172
DDAG
1753 if (block->page_size == TARGET_PAGE_SIZE) {
1754 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1755 return;
1756 }
1757
99e314eb
DDAG
1758 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1759 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1760
1761 if (unsent_pass) {
1762 /* Find a sent page */
1763 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1764 } else {
1765 /* Find a dirty page */
1766 run_start = find_next_bit(bitmap, last + 1, first);
1767 }
1768
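    /*
     * Worked example for the fixup below (sizes are illustrative): with
     * 2MB huge pages and 4KB target pages host_ratio is 512; a run that
     * starts at target page 1000 has host_offset = 1000 % 512 = 488, so
     * it is pulled back to page 512 and that whole host page is
     * discarded and remarked as dirty/unsent.
     */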
1769 while (run_start <= last) {
1770 bool do_fixup = false;
1771 unsigned long fixup_start_addr;
1772 unsigned long host_offset;
1773
1774 /*
1775 * If the start of this run of pages is in the middle of a host
1776 * page, then we need to fixup this host page.
1777 */
1778 host_offset = run_start % host_ratio;
1779 if (host_offset) {
1780 do_fixup = true;
1781 run_start -= host_offset;
1782 fixup_start_addr = run_start;
1783 /* For the next pass */
1784 run_start = run_start + host_ratio;
1785 } else {
1786 /* Find the end of this run */
1787 unsigned long run_end;
1788 if (unsent_pass) {
1789 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1790 } else {
1791 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1792 }
1793 /*
1794 * If the end isn't at the start of a host page, then the
1795 * run doesn't finish at the end of a host page
1796 * and we need to discard.
1797 */
1798 host_offset = run_end % host_ratio;
1799 if (host_offset) {
1800 do_fixup = true;
1801 fixup_start_addr = run_end - host_offset;
1802 /*
1803 * This host page has gone, the next loop iteration starts
1804 * from after the fixup
1805 */
1806 run_start = fixup_start_addr + host_ratio;
1807 } else {
1808 /*
1809 * No discards on this iteration, next loop starts from
1810 * next sent/dirty page
1811 */
1812 run_start = run_end + 1;
1813 }
1814 }
1815
1816 if (do_fixup) {
1817 unsigned long page;
1818
1819 /* Tell the destination to discard this page */
1820 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1821 /* For the unsent_pass we:
1822 * discard partially sent pages
1823 * For the !unsent_pass (dirty) we:
1824 * discard partially dirty pages that were sent
1825 * (any partially sent pages were already discarded
1826 * by the previous unsent_pass)
1827 */
1828 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1829 host_ratio);
1830 }
1831
1832 /* Clean up the bitmap */
1833 for (page = fixup_start_addr;
1834 page < fixup_start_addr + host_ratio; page++) {
1835 /* All pages in this host page are now not sent */
1836 set_bit(page, unsentmap);
1837
1838 /*
1839 * Remark them as dirty, updating the count for any pages
1840 * that weren't previously dirty.
1841 */
1842 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1843 }
1844 }
1845
1846 if (unsent_pass) {
1847 /* Find the next sent page for the next iteration */
1848 run_start = find_next_zero_bit(unsentmap, last + 1,
1849 run_start);
1850 } else {
1851 /* Find the next dirty page for the next iteration */
1852 run_start = find_next_bit(bitmap, last + 1, run_start);
1853 }
1854 }
1855}
1856
3d0684b2
JQ
1857/**
1858 * postcopy_chunk_hostpages: discard any partially sent host page
1859 *
99e314eb
DDAG
1860 * Utility for the outgoing postcopy code.
1861 *
1862 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1863 * dirty host-page size chunks as all dirty. In this case the host-page
1864 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1865 *
3d0684b2
JQ
1866 * Returns zero on success
1867 *
1868 * @ms: current migration state
99e314eb
DDAG
1869 */
1870static int postcopy_chunk_hostpages(MigrationState *ms)
1871{
6f37bb8b 1872 RAMState *rs = &ram_state;
99e314eb
DDAG
1873 struct RAMBlock *block;
1874
99e314eb 1875 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1876 rs->last_seen_block = NULL;
1877 rs->last_sent_block = NULL;
1878 rs->last_offset = 0;
99e314eb
DDAG
1879
1880 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1881 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1882
1883 PostcopyDiscardState *pds =
1884 postcopy_discard_send_init(ms, first, block->idstr);
1885
1886 /* First pass: Discard all partially sent host pages */
1887 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1888 /*
1889 * Second pass: Ensure that all partially dirty host pages are made
1890 * fully dirty.
1891 */
1892 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1893
1894 postcopy_discard_send_finish(ms, pds);
1895 } /* ram_list loop */
1896
1897 return 0;
1898}
1899
3d0684b2
JQ
1900/**
1901 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1902 *
1903 * Returns zero on success
1904 *
e0b266f0
DDAG
1905 * Transmit the set of pages to be discarded after precopy to the target;
1906 * these are pages that:
1907 * a) Have been previously transmitted but are now dirty again
1908 * b) Pages that have never been transmitted; this ensures that
1909 * any pages on the destination that have been mapped by background
1910 * tasks get discarded (transparent huge pages is the specific concern)
1911 * Hopefully this is pretty sparse
3d0684b2
JQ
1912 *
1913 * @ms: current migration state
e0b266f0
DDAG
1914 */
1915int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1916{
1917 int ret;
1918 unsigned long *bitmap, *unsentmap;
1919
1920 rcu_read_lock();
1921
1922 /* This should be our last sync, the src is now paused */
1923 migration_bitmap_sync();
1924
1925 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1926 if (!unsentmap) {
1927 /* We don't have a safe way to resize the unsentmap, so
1928 * if the bitmap was resized it will be NULL at this
1929 * point.
1930 */
1931 error_report("migration ram resized during precopy phase");
1932 rcu_read_unlock();
1933 return -EINVAL;
1934 }
1935
29c59172 1936 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1937 ret = postcopy_chunk_hostpages(ms);
1938 if (ret) {
1939 rcu_read_unlock();
1940 return ret;
1941 }
1942
e0b266f0
DDAG
1943 /*
1944 * Update the unsentmap to be unsentmap = unsentmap | dirty
1945 */
1946 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1947 bitmap_or(unsentmap, unsentmap, bitmap,
1948 last_ram_offset() >> TARGET_PAGE_BITS);
1949
1950
1951 trace_ram_postcopy_send_discard_bitmap();
1952#ifdef DEBUG_POSTCOPY
1953 ram_debug_dump_bitmap(unsentmap, true);
1954#endif
1955
1956 ret = postcopy_each_ram_send_discard(ms);
1957 rcu_read_unlock();
1958
1959 return ret;
1960}
1961
3d0684b2
JQ
1962/**
1963 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1964 *
3d0684b2 1965 * Returns zero on success
e0b266f0 1966 *
3d0684b2 1967 * @mis: current migration incoming state
36449157
JQ
1968 * @rbname: name of the RAMBlock of the request. NULL means the
1969 * same as the last one.
3d0684b2
JQ
1970 * @start: byte offset within the RAMBlock to start discarding at
1971 * @length: number of bytes to discard
e0b266f0
DDAG
1972 */
1973int ram_discard_range(MigrationIncomingState *mis,
36449157 1974 const char *rbname,
e0b266f0
DDAG
1975 uint64_t start, size_t length)
1976{
1977 int ret = -1;
1978
36449157 1979 trace_ram_discard_range(rbname, start, length);
d3a5038c 1980
e0b266f0 1981 rcu_read_lock();
36449157 1982 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1983
1984 if (!rb) {
36449157 1985 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1986 goto err;
1987 }
1988
d3a5038c 1989 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1990
1991err:
1992 rcu_read_unlock();
1993
1994 return ret;
1995}
1996
6f37bb8b 1997static int ram_save_init_globals(RAMState *rs)
56e93d26 1998{
56e93d26
JQ
1999 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2000
56e93d26
JQ
2001 dirty_rate_high_cnt = 0;
2002 bitmap_sync_count = 0;
2003 migration_bitmap_sync_init();
dd631697 2004 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
2005
2006 if (migrate_use_xbzrle()) {
2007 XBZRLE_cache_lock();
adb65dec 2008 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2009 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2010 TARGET_PAGE_SIZE,
2011 TARGET_PAGE_SIZE);
2012 if (!XBZRLE.cache) {
2013 XBZRLE_cache_unlock();
2014 error_report("Error creating cache");
2015 return -1;
2016 }
2017 XBZRLE_cache_unlock();
2018
2019 /* We prefer not to abort if there is no memory */
2020 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2021 if (!XBZRLE.encoded_buf) {
2022 error_report("Error allocating encoded_buf");
2023 return -1;
2024 }
2025
2026 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2027 if (!XBZRLE.current_buf) {
2028 error_report("Error allocating current_buf");
2029 g_free(XBZRLE.encoded_buf);
2030 XBZRLE.encoded_buf = NULL;
2031 return -1;
2032 }
2033
2034 acct_clear();
2035 }
2036
49877834
PB
2037 /* For memory_global_dirty_log_start below. */
2038 qemu_mutex_lock_iothread();
2039
56e93d26
JQ
2040 qemu_mutex_lock_ramlist();
2041 rcu_read_lock();
2042 bytes_transferred = 0;
6f37bb8b 2043 ram_state_reset(rs);
56e93d26 2044
f3f491fc 2045 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
0827b9e9
AA
2046 /* Skip setting bitmap if there is no RAM */
2047 if (ram_bytes_total()) {
2048 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2049 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2050 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2051
2052 if (migrate_postcopy_ram()) {
2053 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2054 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2055 }
f3f491fc
DDAG
2056 }
2057
56e93d26
JQ
2058 /*
2059 * Count the total number of pages used by ram blocks not including any
2060 * gaps due to alignment or unplugs.
2061 */
2062 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2063
2064 memory_global_dirty_log_start();
2065 migration_bitmap_sync();
2066 qemu_mutex_unlock_ramlist();
49877834 2067 qemu_mutex_unlock_iothread();
a91246c9
HZ
2068 rcu_read_unlock();
2069
2070 return 0;
2071}
2072
3d0684b2
JQ
2073/*
2074 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2075 * a long-running RCU critical section. When RCU reclaims in the code
2076 * start to become numerous it will be necessary to reduce the
2077 * granularity of these critical sections.
2078 */
2079
3d0684b2
JQ
2080/**
2081 * ram_save_setup: Setup RAM for migration
2082 *
2083 * Returns zero to indicate success and negative for error
2084 *
2085 * @f: QEMUFile where to send the data
2086 * @opaque: RAMState pointer
2087 */
a91246c9
HZ
2088static int ram_save_setup(QEMUFile *f, void *opaque)
2089{
6f37bb8b 2090 RAMState *rs = opaque;
a91246c9
HZ
2091 RAMBlock *block;
2092
2093 /* migration has already setup the bitmap, reuse it. */
2094 if (!migration_in_colo_state()) {
6f37bb8b 2095 if (ram_save_init_globals(rs) < 0) {
a91246c9
HZ
2096 return -1;
2097 }
2098 }
2099
2100 rcu_read_lock();
56e93d26
JQ
2101
2102 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2103
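    /*
     * Per-block setup record, as written below: a one-byte idstr length,
     * the idstr bytes, and a be64 used_length; if postcopy is enabled and
     * the block's page size differs from the host page size, a be64 page
     * size follows.  The setup stage is terminated by RAM_SAVE_FLAG_EOS.
     */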
2104 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2105 qemu_put_byte(f, strlen(block->idstr));
2106 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2107 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2108 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2109 qemu_put_be64(f, block->page_size);
2110 }
56e93d26
JQ
2111 }
2112
2113 rcu_read_unlock();
2114
2115 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2116 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2117
2118 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2119
2120 return 0;
2121}
2122
3d0684b2
JQ
2123/**
2124 * ram_save_iterate: iterative stage for migration
2125 *
2126 * Returns zero to indicate success and negative for error
2127 *
2128 * @f: QEMUFile where to send the data
2129 * @opaque: RAMState pointer
2130 */
56e93d26
JQ
2131static int ram_save_iterate(QEMUFile *f, void *opaque)
2132{
6f37bb8b 2133 RAMState *rs = opaque;
56e93d26
JQ
2134 int ret;
2135 int i;
2136 int64_t t0;
5c90308f 2137 int done = 0;
56e93d26
JQ
2138
2139 rcu_read_lock();
6f37bb8b
JQ
2140 if (ram_list.version != rs->last_version) {
2141 ram_state_reset(rs);
56e93d26
JQ
2142 }
2143
2144 /* Read version before ram_list.blocks */
2145 smp_rmb();
2146
2147 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2148
2149 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2150 i = 0;
2151 while ((ret = qemu_file_rate_limit(f)) == 0) {
2152 int pages;
2153
6f37bb8b 2154 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
56e93d26
JQ
2155 /* no more pages to send */
2156 if (pages == 0) {
5c90308f 2157 done = 1;
56e93d26
JQ
2158 break;
2159 }
56e93d26 2160 acct_info.iterations++;
070afca2 2161
56e93d26
JQ
2162 /* we want to check in the 1st loop, just in case it was the 1st time
2163 and we had to sync the dirty bitmap.
2164 qemu_clock_get_ns() is a bit expensive, so we only check every
2165 few iterations
2166 */
2167 if ((i & 63) == 0) {
2168 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2169 if (t1 > MAX_WAIT) {
55c4446b 2170 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2171 break;
2172 }
2173 }
2174 i++;
2175 }
2176 flush_compressed_data(f);
2177 rcu_read_unlock();
2178
2179 /*
2180 * Must occur before EOS (or any QEMUFile operation)
2181 * because of RDMA protocol.
2182 */
2183 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2184
2185 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2186 bytes_transferred += 8;
2187
2188 ret = qemu_file_get_error(f);
2189 if (ret < 0) {
2190 return ret;
2191 }
2192
5c90308f 2193 return done;
56e93d26
JQ
2194}
2195
3d0684b2
JQ
2196/**
2197 * ram_save_complete: function called to send the remaining amount of ram
2198 *
2199 * Returns zero to indicate success
2200 *
2201 * Called with the iothread lock held
2202 *
2203 * @f: QEMUFile where to send the data
2204 * @opaque: RAMState pointer
2205 */
56e93d26
JQ
2206static int ram_save_complete(QEMUFile *f, void *opaque)
2207{
6f37bb8b
JQ
2208 RAMState *rs = opaque;
2209
56e93d26
JQ
2210 rcu_read_lock();
2211
663e6c1d
DDAG
2212 if (!migration_in_postcopy(migrate_get_current())) {
2213 migration_bitmap_sync();
2214 }
56e93d26
JQ
2215
2216 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2217
2218 /* try transferring iterative blocks of memory */
2219
2220 /* flush all remaining blocks regardless of rate limiting */
2221 while (true) {
2222 int pages;
2223
6f37bb8b 2224 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
a91246c9 2225 &bytes_transferred);
56e93d26
JQ
2226 /* no more blocks to send */
2227 if (pages == 0) {
2228 break;
2229 }
2230 }
2231
2232 flush_compressed_data(f);
2233 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2234
2235 rcu_read_unlock();
d09a6fde 2236
56e93d26
JQ
2237 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2238
2239 return 0;
2240}
2241
c31b098f
DDAG
2242static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2243 uint64_t *non_postcopiable_pending,
2244 uint64_t *postcopiable_pending)
56e93d26
JQ
2245{
2246 uint64_t remaining_size;
2247
2248 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2249
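    /*
     * When the estimate is already under max_size, re-sync the dirty
     * bitmap (under the iothread lock) and recompute, so the decision to
     * move to the completion stage is based on fresh dirty-page counts.
     */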
663e6c1d
DDAG
2250 if (!migration_in_postcopy(migrate_get_current()) &&
2251 remaining_size < max_size) {
56e93d26
JQ
2252 qemu_mutex_lock_iothread();
2253 rcu_read_lock();
2254 migration_bitmap_sync();
2255 rcu_read_unlock();
2256 qemu_mutex_unlock_iothread();
2257 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2258 }
c31b098f
DDAG
2259
2260 /* We can do postcopy, and all the data is postcopiable */
2261 *postcopiable_pending += remaining_size;
56e93d26
JQ
2262}
2263
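/*
 * Wire format consumed by load_xbzrle() below: a one-byte encoding flag
 * (must be ENCODING_FLAG_XBZRLE), a be16 length of the encoded data, and
 * the encoded bytes themselves, which are decoded on top of the current
 * contents of the page at @host.
 */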
2264static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2265{
2266 unsigned int xh_len;
2267 int xh_flags;
063e760a 2268 uint8_t *loaded_data;
56e93d26
JQ
2269
2270 if (!xbzrle_decoded_buf) {
2271 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2272 }
063e760a 2273 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2274
2275 /* extract RLE header */
2276 xh_flags = qemu_get_byte(f);
2277 xh_len = qemu_get_be16(f);
2278
2279 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2280 error_report("Failed to load XBZRLE page - wrong compression!");
2281 return -1;
2282 }
2283
2284 if (xh_len > TARGET_PAGE_SIZE) {
2285 error_report("Failed to load XBZRLE page - len overflow!");
2286 return -1;
2287 }
2288 /* load data and decode */
063e760a 2289 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2290
2291 /* decode RLE */
063e760a 2292 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2293 TARGET_PAGE_SIZE) == -1) {
2294 error_report("Failed to load XBZRLE page - decode error!");
2295 return -1;
2296 }
2297
2298 return 0;
2299}
2300
3d0684b2
JQ
2301/**
2302 * ram_block_from_stream: read a RAMBlock id from the migration stream
2303 *
2304 * Must be called from within an RCU critical section.
2305 *
56e93d26 2306 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2307 *
3d0684b2
JQ
2308 * @f: QEMUFile where to read the data from
2309 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2310 */
3d0684b2 2311static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2312{
2313 static RAMBlock *block = NULL;
2314 char id[256];
2315 uint8_t len;
2316
2317 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2318 if (!block) {
56e93d26
JQ
2319 error_report("Ack, bad migration stream!");
2320 return NULL;
2321 }
4c4bad48 2322 return block;
56e93d26
JQ
2323 }
2324
2325 len = qemu_get_byte(f);
2326 qemu_get_buffer(f, (uint8_t *)id, len);
2327 id[len] = 0;
2328
e3dd7493 2329 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2330 if (!block) {
2331 error_report("Can't find block %s", id);
2332 return NULL;
56e93d26
JQ
2333 }
2334
4c4bad48
HZ
2335 return block;
2336}
2337
2338static inline void *host_from_ram_block_offset(RAMBlock *block,
2339 ram_addr_t offset)
2340{
2341 if (!offset_in_ramblock(block, offset)) {
2342 return NULL;
2343 }
2344
2345 return block->host + offset;
56e93d26
JQ
2346}
2347
3d0684b2
JQ
2348/**
2349 * ram_handle_compressed: handle the zero page case
2350 *
56e93d26
JQ
2351 * If a page (or a whole RDMA chunk) has been
2352 * determined to be zero, then zap it.
3d0684b2
JQ
2353 *
2354 * @host: host address for the zero page
2355 * @ch: what the page is filled from. We only support zero
2356 * @size: size of the zero page
56e93d26
JQ
2357 */
2358void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2359{
2360 if (ch != 0 || !is_zero_range(host, size)) {
2361 memset(host, ch, size);
2362 }
2363}
2364
2365static void *do_data_decompress(void *opaque)
2366{
2367 DecompressParam *param = opaque;
2368 unsigned long pagesize;
33d151f4
LL
2369 uint8_t *des;
2370 int len;
56e93d26 2371
33d151f4 2372 qemu_mutex_lock(&param->mutex);
90e56fb4 2373 while (!param->quit) {
33d151f4
LL
2374 if (param->des) {
2375 des = param->des;
2376 len = param->len;
2377 param->des = 0;
2378 qemu_mutex_unlock(&param->mutex);
2379
56e93d26 2380 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2381 /* uncompress() may fail in some cases, especially
2382 * when the page was dirtied while being compressed; that is
2383 * not a problem because the dirty page will be retransmitted
2384 * and uncompress() won't break the data in other pages.
2385 */
33d151f4
LL
2386 uncompress((Bytef *)des, &pagesize,
2387 (const Bytef *)param->compbuf, len);
73a8912b 2388
33d151f4
LL
2389 qemu_mutex_lock(&decomp_done_lock);
2390 param->done = true;
2391 qemu_cond_signal(&decomp_done_cond);
2392 qemu_mutex_unlock(&decomp_done_lock);
2393
2394 qemu_mutex_lock(&param->mutex);
2395 } else {
2396 qemu_cond_wait(&param->cond, &param->mutex);
2397 }
56e93d26 2398 }
33d151f4 2399 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2400
2401 return NULL;
2402}
2403
5533b2e9
LL
2404static void wait_for_decompress_done(void)
2405{
2406 int idx, thread_count;
2407
2408 if (!migrate_use_compression()) {
2409 return;
2410 }
2411
2412 thread_count = migrate_decompress_threads();
2413 qemu_mutex_lock(&decomp_done_lock);
2414 for (idx = 0; idx < thread_count; idx++) {
2415 while (!decomp_param[idx].done) {
2416 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2417 }
2418 }
2419 qemu_mutex_unlock(&decomp_done_lock);
2420}
2421
56e93d26
JQ
2422void migrate_decompress_threads_create(void)
2423{
2424 int i, thread_count;
2425
2426 thread_count = migrate_decompress_threads();
2427 decompress_threads = g_new0(QemuThread, thread_count);
2428 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2429 qemu_mutex_init(&decomp_done_lock);
2430 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2431 for (i = 0; i < thread_count; i++) {
2432 qemu_mutex_init(&decomp_param[i].mutex);
2433 qemu_cond_init(&decomp_param[i].cond);
2434 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2435 decomp_param[i].done = true;
90e56fb4 2436 decomp_param[i].quit = false;
56e93d26
JQ
2437 qemu_thread_create(decompress_threads + i, "decompress",
2438 do_data_decompress, decomp_param + i,
2439 QEMU_THREAD_JOINABLE);
2440 }
2441}
2442
2443void migrate_decompress_threads_join(void)
2444{
2445 int i, thread_count;
2446
56e93d26
JQ
2447 thread_count = migrate_decompress_threads();
2448 for (i = 0; i < thread_count; i++) {
2449 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2450 decomp_param[i].quit = true;
56e93d26
JQ
2451 qemu_cond_signal(&decomp_param[i].cond);
2452 qemu_mutex_unlock(&decomp_param[i].mutex);
2453 }
2454 for (i = 0; i < thread_count; i++) {
2455 qemu_thread_join(decompress_threads + i);
2456 qemu_mutex_destroy(&decomp_param[i].mutex);
2457 qemu_cond_destroy(&decomp_param[i].cond);
2458 g_free(decomp_param[i].compbuf);
2459 }
2460 g_free(decompress_threads);
2461 g_free(decomp_param);
56e93d26
JQ
2462 decompress_threads = NULL;
2463 decomp_param = NULL;
56e93d26
JQ
2464}
2465
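/*
 * Dispatch a compressed page to an idle decompression worker: pick a
 * thread whose 'done' flag is set, hand it the compressed buffer and the
 * destination host pointer, and signal it; if all workers are busy, wait
 * on decomp_done_cond until one of them finishes.
 */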
c1bc6626 2466static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2467 void *host, int len)
2468{
2469 int idx, thread_count;
2470
2471 thread_count = migrate_decompress_threads();
73a8912b 2472 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2473 while (true) {
2474 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2475 if (decomp_param[idx].done) {
33d151f4
LL
2476 decomp_param[idx].done = false;
2477 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2478 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2479 decomp_param[idx].des = host;
2480 decomp_param[idx].len = len;
33d151f4
LL
2481 qemu_cond_signal(&decomp_param[idx].cond);
2482 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2483 break;
2484 }
2485 }
2486 if (idx < thread_count) {
2487 break;
73a8912b
LL
2488 } else {
2489 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2490 }
2491 }
73a8912b 2492 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2493}
2494
3d0684b2
JQ
2495/**
2496 * ram_postcopy_incoming_init: allocate postcopy data structures
2497 *
2498 * Returns 0 for success and negative if there was an error
2499 *
2500 * @mis: current migration incoming state
2501 *
2502 * Allocate data structures etc needed by incoming migration with
2503 * postcopy-ram. postcopy-ram's similarly named
2504 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2505 */
2506int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2507{
2508 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2509
2510 return postcopy_ram_incoming_init(mis, ram_pages);
2511}
2512
3d0684b2
JQ
2513/**
2514 * ram_load_postcopy: load a page in postcopy case
2515 *
2516 * Returns 0 for success or -errno in case of error
2517 *
a7180877
DDAG
2518 * Called in postcopy mode by ram_load().
2519 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2520 *
2521 * @f: QEMUFile to read the data from
a7180877
DDAG
2522 */
2523static int ram_load_postcopy(QEMUFile *f)
2524{
2525 int flags = 0, ret = 0;
2526 bool place_needed = false;
28abd200 2527 bool matching_page_sizes = false;
a7180877
DDAG
2528 MigrationIncomingState *mis = migration_incoming_get_current();
2529 /* Temporary page that is later 'placed' */
2530 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2531 void *last_host = NULL;
a3b6ff6d 2532 bool all_zero = false;
a7180877
DDAG
2533
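    /*
     * Sketch of the host-page assembly done below (sizes illustrative):
     * with 2MB huge pages and 4KB target pages, 512 consecutive target
     * pages are read into postcopy_host_page at their offset within the
     * host page, and only when the last one arrives is the whole host
     * page placed atomically (zero-placed if every target page was zero).
     */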
2534 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2535 ram_addr_t addr;
2536 void *host = NULL;
2537 void *page_buffer = NULL;
2538 void *place_source = NULL;
df9ff5e1 2539 RAMBlock *block = NULL;
a7180877 2540 uint8_t ch;
a7180877
DDAG
2541
2542 addr = qemu_get_be64(f);
2543 flags = addr & ~TARGET_PAGE_MASK;
2544 addr &= TARGET_PAGE_MASK;
2545
2546 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2547 place_needed = false;
2548 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2549 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2550
2551 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2552 if (!host) {
2553 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2554 ret = -EINVAL;
2555 break;
2556 }
28abd200 2557 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2558 /*
28abd200
DDAG
2559 * Postcopy requires that we place whole host pages atomically;
2560 * these may be huge pages for RAMBlocks that are backed by
2561 * hugetlbfs.
a7180877
DDAG
2562 * To make it atomic, the data is read into a temporary page
2563 * that's moved into place later.
2564 * The migration protocol uses (possibly smaller) target pages;
2565 * however, the source ensures it always sends all the components
2566 * of a host page in order.
2567 */
2568 page_buffer = postcopy_host_page +
28abd200 2569 ((uintptr_t)host & (block->page_size - 1));
a7180877 2570 /* If all TP are zero then we can optimise the place */
28abd200 2571 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2572 all_zero = true;
c53b7ddc
DDAG
2573 } else {
2574 /* not the 1st TP within the HP */
2575 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2576 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2577 host, last_host);
2578 ret = -EINVAL;
2579 break;
2580 }
a7180877
DDAG
2581 }
2582
c53b7ddc 2583
a7180877
DDAG
2584 /*
2585 * If it's the last part of a host page then we place the host
2586 * page
2587 */
2588 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2589 (block->page_size - 1)) == 0;
a7180877
DDAG
2590 place_source = postcopy_host_page;
2591 }
c53b7ddc 2592 last_host = host;
a7180877
DDAG
2593
2594 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2595 case RAM_SAVE_FLAG_COMPRESS:
2596 ch = qemu_get_byte(f);
2597 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2598 if (ch) {
2599 all_zero = false;
2600 }
2601 break;
2602
2603 case RAM_SAVE_FLAG_PAGE:
2604 all_zero = false;
2605 if (!place_needed || !matching_page_sizes) {
2606 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2607 } else {
2608 /* Avoids the qemu_file copy during postcopy, since we are
2609 * going to do a copy later anyway; we can only do it when we
2610 * do this read in one go (matching page sizes)
2611 */
2612 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2613 TARGET_PAGE_SIZE);
2614 }
2615 break;
2616 case RAM_SAVE_FLAG_EOS:
2617 /* normal exit */
2618 break;
2619 default:
2620 error_report("Unknown combination of migration flags: %#x"
2621 " (postcopy mode)", flags);
2622 ret = -EINVAL;
2623 }
2624
2625 if (place_needed) {
2626 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2627 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2628
a7180877 2629 if (all_zero) {
df9ff5e1
DDAG
2630 ret = postcopy_place_page_zero(mis, place_dest,
2631 block->page_size);
a7180877 2632 } else {
df9ff5e1
DDAG
2633 ret = postcopy_place_page(mis, place_dest,
2634 place_source, block->page_size);
a7180877
DDAG
2635 }
2636 }
2637 if (!ret) {
2638 ret = qemu_file_get_error(f);
2639 }
2640 }
2641
2642 return ret;
2643}
2644
56e93d26
JQ
2645static int ram_load(QEMUFile *f, void *opaque, int version_id)
2646{
2647 int flags = 0, ret = 0;
2648 static uint64_t seq_iter;
2649 int len = 0;
a7180877
DDAG
2650 /*
2651 * If the system is running in postcopy mode, page inserts to host memory must
2652 * be atomic
2653 */
2654 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2655 /* ADVISE is earlier; it shows the source has the postcopy capability on */
2656 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2657
2658 seq_iter++;
2659
2660 if (version_id != 4) {
2661 ret = -EINVAL;
2662 }
2663
2664 /* This RCU critical section can be very long running.
2665 * When RCU reclaims in the code start to become numerous,
2666 * it will be necessary to reduce the granularity of this
2667 * critical section.
2668 */
2669 rcu_read_lock();
a7180877
DDAG
2670
2671 if (postcopy_running) {
2672 ret = ram_load_postcopy(f);
2673 }
2674
2675 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2676 ram_addr_t addr, total_ram_bytes;
a776aa15 2677 void *host = NULL;
56e93d26
JQ
2678 uint8_t ch;
2679
2680 addr = qemu_get_be64(f);
2681 flags = addr & ~TARGET_PAGE_MASK;
2682 addr &= TARGET_PAGE_MASK;
2683
a776aa15
DDAG
2684 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2685 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2686 RAMBlock *block = ram_block_from_stream(f, flags);
2687
2688 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2689 if (!host) {
2690 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2691 ret = -EINVAL;
2692 break;
2693 }
2694 }
2695
56e93d26
JQ
2696 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2697 case RAM_SAVE_FLAG_MEM_SIZE:
2698 /* Synchronize RAM block list */
2699 total_ram_bytes = addr;
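            /*
             * Each block record mirrors what ram_save_setup() wrote: an
             * idstr length byte, the idstr, a be64 used_length, and,
             * when postcopy was advised and the block is not host-page
             * sized, a be64 page size that is checked against ours.
             */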
2700 while (!ret && total_ram_bytes) {
2701 RAMBlock *block;
56e93d26
JQ
2702 char id[256];
2703 ram_addr_t length;
2704
2705 len = qemu_get_byte(f);
2706 qemu_get_buffer(f, (uint8_t *)id, len);
2707 id[len] = 0;
2708 length = qemu_get_be64(f);
2709
e3dd7493
DDAG
2710 block = qemu_ram_block_by_name(id);
2711 if (block) {
2712 if (length != block->used_length) {
2713 Error *local_err = NULL;
56e93d26 2714
fa53a0e5 2715 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2716 &local_err);
2717 if (local_err) {
2718 error_report_err(local_err);
56e93d26 2719 }
56e93d26 2720 }
ef08fb38
DDAG
2721 /* For postcopy we need to check hugepage sizes match */
2722 if (postcopy_advised &&
2723 block->page_size != qemu_host_page_size) {
2724 uint64_t remote_page_size = qemu_get_be64(f);
2725 if (remote_page_size != block->page_size) {
2726 error_report("Mismatched RAM page size %s "
2727 "(local) %zd != %" PRId64,
2728 id, block->page_size,
2729 remote_page_size);
2730 ret = -EINVAL;
2731 }
2732 }
e3dd7493
DDAG
2733 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2734 block->idstr);
2735 } else {
56e93d26
JQ
2736 error_report("Unknown ramblock \"%s\", cannot "
2737 "accept migration", id);
2738 ret = -EINVAL;
2739 }
2740
2741 total_ram_bytes -= length;
2742 }
2743 break;
a776aa15 2744
56e93d26 2745 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2746 ch = qemu_get_byte(f);
2747 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2748 break;
a776aa15 2749
56e93d26 2750 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2751 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2752 break;
56e93d26 2753
a776aa15 2754 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2755 len = qemu_get_be32(f);
2756 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2757 error_report("Invalid compressed data length: %d", len);
2758 ret = -EINVAL;
2759 break;
2760 }
c1bc6626 2761 decompress_data_with_multi_threads(f, host, len);
56e93d26 2762 break;
a776aa15 2763
56e93d26 2764 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2765 if (load_xbzrle(f, addr, host) < 0) {
2766 error_report("Failed to decompress XBZRLE page at "
2767 RAM_ADDR_FMT, addr);
2768 ret = -EINVAL;
2769 break;
2770 }
2771 break;
2772 case RAM_SAVE_FLAG_EOS:
2773 /* normal exit */
2774 break;
2775 default:
2776 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2777 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2778 } else {
2779 error_report("Unknown combination of migration flags: %#x",
2780 flags);
2781 ret = -EINVAL;
2782 }
2783 }
2784 if (!ret) {
2785 ret = qemu_file_get_error(f);
2786 }
2787 }
2788
5533b2e9 2789 wait_for_decompress_done();
56e93d26 2790 rcu_read_unlock();
55c4446b 2791 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2792 return ret;
2793}
2794
2795static SaveVMHandlers savevm_ram_handlers = {
2796 .save_live_setup = ram_save_setup,
2797 .save_live_iterate = ram_save_iterate,
763c906b 2798 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2799 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2800 .save_live_pending = ram_save_pending,
2801 .load_state = ram_load,
6ad2a215 2802 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2803};
2804
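/*
 * Register the RAM handlers with the live-migration framework under the
 * section name "ram", stream version 4 (the version ram_load() checks),
 * with &ram_state as the opaque pointer handed back to every handler.
 */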
2805void ram_mig_init(void)
2806{
2807 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2808 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2809}