migration/ram.c
ram: Change byte_xfer_{prev,now} type to uint64_t
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h start with 0x100 next */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
60
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* struct contains XBZRLE cache and a static page
69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by the XBZRLE.lock mutex.
102 *
103 * Returns the resulting cache size (a power of two) or negative on error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
140
6f37bb8b
JQ
141/* State of RAM for migration */
142struct RAMState {
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
152 bool ram_bulk_stage;
8d820d6f
JQ
153 /* How many times we have dirty too many pages */
154 int dirty_rate_high_cnt;
5a987738
JQ
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
f664da80
JQ
157 /* these variables are used for bitmap sync */
158 /* last time we did a full bitmap_sync */
159 int64_t time_last_bitmap_sync;
eac74159 160 /* bytes transferred at start_time */
c4bdf0cf 161 uint64_t bytes_xfer_prev;
6f37bb8b
JQ
162};
163typedef struct RAMState RAMState;
164
165static RAMState ram_state;
166
56e93d26
JQ
167/* accounting for migration statistics */
168typedef struct AccountingInfo {
169 uint64_t dup_pages;
170 uint64_t skipped_pages;
171 uint64_t norm_pages;
172 uint64_t iterations;
173 uint64_t xbzrle_bytes;
174 uint64_t xbzrle_pages;
175 uint64_t xbzrle_cache_miss;
176 double xbzrle_cache_miss_rate;
177 uint64_t xbzrle_overflows;
178} AccountingInfo;
179
180static AccountingInfo acct_info;
181
182static void acct_clear(void)
183{
184 memset(&acct_info, 0, sizeof(acct_info));
185}
186
187uint64_t dup_mig_bytes_transferred(void)
188{
189 return acct_info.dup_pages * TARGET_PAGE_SIZE;
190}
191
192uint64_t dup_mig_pages_transferred(void)
193{
194 return acct_info.dup_pages;
195}
196
197uint64_t skipped_mig_bytes_transferred(void)
198{
199 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
200}
201
202uint64_t skipped_mig_pages_transferred(void)
203{
204 return acct_info.skipped_pages;
205}
206
207uint64_t norm_mig_bytes_transferred(void)
208{
209 return acct_info.norm_pages * TARGET_PAGE_SIZE;
210}
211
212uint64_t norm_mig_pages_transferred(void)
213{
214 return acct_info.norm_pages;
215}
216
217uint64_t xbzrle_mig_bytes_transferred(void)
218{
219 return acct_info.xbzrle_bytes;
220}
221
222uint64_t xbzrle_mig_pages_transferred(void)
223{
224 return acct_info.xbzrle_pages;
225}
226
227uint64_t xbzrle_mig_pages_cache_miss(void)
228{
229 return acct_info.xbzrle_cache_miss;
230}
231
232double xbzrle_mig_cache_miss_rate(void)
233{
234 return acct_info.xbzrle_cache_miss_rate;
235}
236
237uint64_t xbzrle_mig_pages_overflow(void)
238{
239 return acct_info.xbzrle_overflows;
240}
241
dd631697 242static QemuMutex migration_bitmap_mutex;
56e93d26 243static uint64_t migration_dirty_pages;
56e93d26 244
b8fb8cb7
DDAG
245/* used by the search for pages to send */
246struct PageSearchStatus {
247 /* Current block being searched */
248 RAMBlock *block;
249 /* Current offset to search from */
250 ram_addr_t offset;
251 /* Set once we wrap around */
252 bool complete_round;
253};
254typedef struct PageSearchStatus PageSearchStatus;
255
60be6340
DL
256static struct BitmapRcu {
257 struct rcu_head rcu;
f3f491fc 258 /* Main migration bitmap */
60be6340 259 unsigned long *bmap;
f3f491fc
DDAG
260 /* bitmap of pages that haven't been sent even once
261 * only maintained and used in postcopy at the moment
262 * where it's used to send the dirtymap at the start
263 * of the postcopy phase
264 */
265 unsigned long *unsentmap;
60be6340
DL
266} *migration_bitmap_rcu;
267
56e93d26 268struct CompressParam {
56e93d26 269 bool done;
90e56fb4 270 bool quit;
56e93d26
JQ
271 QEMUFile *file;
272 QemuMutex mutex;
273 QemuCond cond;
274 RAMBlock *block;
275 ram_addr_t offset;
276};
277typedef struct CompressParam CompressParam;
278
279struct DecompressParam {
73a8912b 280 bool done;
90e56fb4 281 bool quit;
56e93d26
JQ
282 QemuMutex mutex;
283 QemuCond cond;
284 void *des;
d341d9f3 285 uint8_t *compbuf;
56e93d26
JQ
286 int len;
287};
288typedef struct DecompressParam DecompressParam;
289
290static CompressParam *comp_param;
291static QemuThread *compress_threads;
292/* comp_done_cond is used to wake up the migration thread when
293 * one of the compression threads has finished the compression.
294 * comp_done_lock is used to co-work with comp_done_cond.
295 */
0d9f9a5c
LL
296static QemuMutex comp_done_lock;
297static QemuCond comp_done_cond;
56e93d26
JQ
298/* The empty QEMUFileOps will be used by file in CompressParam */
299static const QEMUFileOps empty_ops = { };
300
301static bool compression_switch;
56e93d26
JQ
302static DecompressParam *decomp_param;
303static QemuThread *decompress_threads;
73a8912b
LL
304static QemuMutex decomp_done_lock;
305static QemuCond decomp_done_cond;
56e93d26 306
a7a9a88f
LL
307static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
308 ram_addr_t offset);
56e93d26
JQ
309
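/* Compression worker thread body: wait until a (block, offset) pair is
 * queued in this thread's CompressParam, compress that page into the
 * per-thread QEMUFile buffer, then mark the thread done and signal the
 * migration thread; loop until param->quit is set.
 */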
310static void *do_data_compress(void *opaque)
311{
312 CompressParam *param = opaque;
a7a9a88f
LL
313 RAMBlock *block;
314 ram_addr_t offset;
56e93d26 315
a7a9a88f 316 qemu_mutex_lock(&param->mutex);
90e56fb4 317 while (!param->quit) {
a7a9a88f
LL
318 if (param->block) {
319 block = param->block;
320 offset = param->offset;
321 param->block = NULL;
322 qemu_mutex_unlock(&param->mutex);
323
324 do_compress_ram_page(param->file, block, offset);
325
0d9f9a5c 326 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 327 param->done = true;
0d9f9a5c
LL
328 qemu_cond_signal(&comp_done_cond);
329 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
330
331 qemu_mutex_lock(&param->mutex);
332 } else {
56e93d26
JQ
333 qemu_cond_wait(&param->cond, &param->mutex);
334 }
56e93d26 335 }
a7a9a88f 336 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
337
338 return NULL;
339}
340
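/* Ask every compression thread to quit; each worker leaves its loop the
 * next time it checks param->quit.
 */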
341static inline void terminate_compression_threads(void)
342{
343 int idx, thread_count;
344
345 thread_count = migrate_compress_threads();
3d0684b2 346
56e93d26
JQ
347 for (idx = 0; idx < thread_count; idx++) {
348 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 349 comp_param[idx].quit = true;
56e93d26
JQ
350 qemu_cond_signal(&comp_param[idx].cond);
351 qemu_mutex_unlock(&comp_param[idx].mutex);
352 }
353}
354
355void migrate_compress_threads_join(void)
356{
357 int i, thread_count;
358
359 if (!migrate_use_compression()) {
360 return;
361 }
362 terminate_compression_threads();
363 thread_count = migrate_compress_threads();
364 for (i = 0; i < thread_count; i++) {
365 qemu_thread_join(compress_threads + i);
366 qemu_fclose(comp_param[i].file);
367 qemu_mutex_destroy(&comp_param[i].mutex);
368 qemu_cond_destroy(&comp_param[i].cond);
369 }
0d9f9a5c
LL
370 qemu_mutex_destroy(&comp_done_lock);
371 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
372 g_free(compress_threads);
373 g_free(comp_param);
56e93d26
JQ
374 compress_threads = NULL;
375 comp_param = NULL;
56e93d26
JQ
376}
377
378void migrate_compress_threads_create(void)
379{
380 int i, thread_count;
381
382 if (!migrate_use_compression()) {
383 return;
384 }
56e93d26
JQ
385 compression_switch = true;
386 thread_count = migrate_compress_threads();
387 compress_threads = g_new0(QemuThread, thread_count);
388 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
389 qemu_cond_init(&comp_done_cond);
390 qemu_mutex_init(&comp_done_lock);
56e93d26 391 for (i = 0; i < thread_count; i++) {
e110aa91
C
392 /* comp_param[i].file is just used as a dummy buffer to save data,
393 * set its ops to empty.
56e93d26
JQ
394 */
395 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
396 comp_param[i].done = true;
90e56fb4 397 comp_param[i].quit = false;
56e93d26
JQ
398 qemu_mutex_init(&comp_param[i].mutex);
399 qemu_cond_init(&comp_param[i].cond);
400 qemu_thread_create(compress_threads + i, "compress",
401 do_data_compress, comp_param + i,
402 QEMU_THREAD_JOINABLE);
403 }
404}
405
406/**
3d0684b2 407 * save_page_header: write page header to wire
56e93d26
JQ
408 *
409 * If this is the 1st block, it also writes the block identification
410 *
3d0684b2 411 * Returns the number of bytes written
56e93d26
JQ
412 *
413 * @f: QEMUFile where to send the data
414 * @block: block that contains the page we want to send
415 * @offset: offset inside the block for the page
416 * in the lower bits, it contains flags
417 */
418static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
419{
9f5f380b 420 size_t size, len;
56e93d26
JQ
421
422 qemu_put_be64(f, offset);
423 size = 8;
424
425 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
426 len = strlen(block->idstr);
427 qemu_put_byte(f, len);
428 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
429 size += 1 + len;
56e93d26
JQ
430 }
431 return size;
432}
433
3d0684b2
JQ
434/**
435 * mig_throttle_guest_down: throttle down the guest
436 *
437 * Reduce amount of guest cpu execution to hopefully slow down memory
438 * writes. If guest dirty memory rate is reduced below the rate at
439 * which we can transfer pages to the destination then we should be
440 * able to complete migration. Some workloads dirty memory way too
441 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
442 */
443static void mig_throttle_guest_down(void)
444{
445 MigrationState *s = migrate_get_current();
2594f56d
DB
446 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
447 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
448
449 /* We have not started throttling yet. Let's start it. */
450 if (!cpu_throttle_active()) {
451 cpu_throttle_set(pct_initial);
452 } else {
453 /* Throttling already on, just increase the rate */
454 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
455 }
456}
457
3d0684b2
JQ
458/**
459 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
460 *
6f37bb8b 461 * @rs: current RAM state
3d0684b2
JQ
462 * @current_addr: address for the zero page
463 *
464 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
465 * The important thing is that a stale (not-yet-0'd) page be replaced
466 * by the new data.
467 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 468 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 469 */
6f37bb8b 470static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 471{
6f37bb8b 472 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
473 return;
474 }
475
476 /* We don't care if this fails to allocate a new cache page
477 * as long as it updated an old one */
478 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 479 rs->bitmap_sync_count);
56e93d26
JQ
480}
481
482#define ENCODING_FLAG_XBZRLE 0x1
483
484/**
485 * save_xbzrle_page: compress and send current page
486 *
487 * Returns: 1 means that we wrote the page
488 * 0 means that page is identical to the one already sent
489 * -1 means that xbzrle would be longer than normal
490 *
5a987738 491 * @rs: current RAM state
56e93d26 492 * @f: QEMUFile where to send the data
3d0684b2
JQ
493 * @current_data: pointer to the address of the page contents
494 * @current_addr: addr of the page
56e93d26
JQ
495 * @block: block that contains the page we want to send
496 * @offset: offset inside the block for the page
497 * @last_stage: if we are at the completion stage
498 * @bytes_transferred: increase it with the number of transferred bytes
499 */
5a987738 500static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
56e93d26
JQ
501 ram_addr_t current_addr, RAMBlock *block,
502 ram_addr_t offset, bool last_stage,
503 uint64_t *bytes_transferred)
504{
505 int encoded_len = 0, bytes_xbzrle;
506 uint8_t *prev_cached_page;
507
5a987738 508 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
56e93d26
JQ
509 acct_info.xbzrle_cache_miss++;
510 if (!last_stage) {
511 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 512 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
513 return -1;
514 } else {
515 /* update *current_data when the page has been
516 inserted into cache */
517 *current_data = get_cached_data(XBZRLE.cache, current_addr);
518 }
519 }
520 return -1;
521 }
522
523 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
524
525 /* save current buffer into memory */
526 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
527
528 /* XBZRLE encoding (if there is no overflow) */
529 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
530 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
531 TARGET_PAGE_SIZE);
532 if (encoded_len == 0) {
55c4446b 533 trace_save_xbzrle_page_skipping();
56e93d26
JQ
534 return 0;
535 } else if (encoded_len == -1) {
55c4446b 536 trace_save_xbzrle_page_overflow();
56e93d26
JQ
537 acct_info.xbzrle_overflows++;
538 /* update data in the cache */
539 if (!last_stage) {
540 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
541 *current_data = prev_cached_page;
542 }
543 return -1;
544 }
545
546 /* we need to update the data in the cache, in order to get the same data */
547 if (!last_stage) {
548 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
549 }
550
551 /* Send XBZRLE based compressed page */
552 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
553 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
554 qemu_put_be16(f, encoded_len);
555 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
556 bytes_xbzrle += encoded_len + 1 + 2;
557 acct_info.xbzrle_pages++;
558 acct_info.xbzrle_bytes += bytes_xbzrle;
559 *bytes_transferred += bytes_xbzrle;
560
561 return 1;
562}
563
3d0684b2
JQ
564/**
565 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 566 *
3d0684b2
JQ
567 * Called with rcu_read_lock() to protect migration_bitmap
568 *
569 * Returns the byte offset within memory region of the start of a dirty page
570 *
6f37bb8b 571 * @rs: current RAM state
3d0684b2
JQ
572 * @rb: RAMBlock where to search for dirty pages
573 * @start: starting address (typically so we can continue from previous page)
574 * @ram_addr_abs: pointer into which to store the address of the dirty page
575 * within the global ram_addr space
f3f491fc 576 */
56e93d26 577static inline
6f37bb8b 578ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
579 ram_addr_t start,
580 ram_addr_t *ram_addr_abs)
56e93d26 581{
2f68e399 582 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 583 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
584 uint64_t rb_size = rb->used_length;
585 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 586 unsigned long *bitmap;
56e93d26
JQ
587
588 unsigned long next;
589
60be6340 590 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
6f37bb8b 591 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
592 next = nr + 1;
593 } else {
2ff64038 594 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
595 }
596
f3f491fc 597 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
598 return (next - base) << TARGET_PAGE_BITS;
599}
600
a82d593b
DDAG
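/* Clear the dirty bit for one page and, if it was set, decrement the
 * global dirty page count.  Returns whether the bit was previously set.
 */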
601static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
602{
603 bool ret;
604 int nr = addr >> TARGET_PAGE_BITS;
605 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
606
607 ret = test_and_clear_bit(nr, bitmap);
608
609 if (ret) {
610 migration_dirty_pages--;
611 }
612 return ret;
613}
614
1ffb5dfd 615static int64_t num_dirty_pages_period;
56e93d26
JQ
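/* Fold the dirty log for [start, start + length) from the memory core
 * into the migration bitmap and update the dirty page counters.
 */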
616static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
617{
2ff64038 618 unsigned long *bitmap;
60be6340 619 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1ffb5dfd
CF
620 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
621 start, length, &num_dirty_pages_period);
56e93d26
JQ
622}
623
56e93d26 624/* Fix me: there are too many global variables used in the migration process. */
56e93d26
JQ
625static uint64_t xbzrle_cache_miss_prev;
626static uint64_t iterations_prev;
627
f664da80 628static void migration_bitmap_sync_init(RAMState *rs)
56e93d26 629{
f664da80 630 rs->time_last_bitmap_sync = 0;
eac74159 631 rs->bytes_xfer_prev = 0;
56e93d26
JQ
632 num_dirty_pages_period = 0;
633 xbzrle_cache_miss_prev = 0;
634 iterations_prev = 0;
635}
636
3d0684b2
JQ
637/**
638 * ram_pagesize_summary: calculate all the pagesizes of a VM
639 *
640 * Returns a summary bitmap of the page sizes of all RAMBlocks
641 *
642 * For VMs with just normal pages this is equivalent to the host page
643 * size. If it's got some huge pages then it's the OR of all the
644 * different page sizes.
e8ca1db2
DDAG
645 */
646uint64_t ram_pagesize_summary(void)
647{
648 RAMBlock *block;
649 uint64_t summary = 0;
650
651 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
652 summary |= block->page_size;
653 }
654
655 return summary;
656}
657
8d820d6f 658static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
659{
660 RAMBlock *block;
56e93d26
JQ
661 MigrationState *s = migrate_get_current();
662 int64_t end_time;
c4bdf0cf 663 uint64_t bytes_xfer_now;
56e93d26 664
5a987738 665 rs->bitmap_sync_count++;
56e93d26 666
eac74159
JQ
667 if (!rs->bytes_xfer_prev) {
668 rs->bytes_xfer_prev = ram_bytes_transferred();
56e93d26
JQ
669 }
670
f664da80
JQ
671 if (!rs->time_last_bitmap_sync) {
672 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
673 }
674
675 trace_migration_bitmap_sync_start();
9c1f8f44 676 memory_global_dirty_log_sync();
56e93d26 677
dd631697 678 qemu_mutex_lock(&migration_bitmap_mutex);
56e93d26
JQ
679 rcu_read_lock();
680 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2f68e399 681 migration_bitmap_sync_range(block->offset, block->used_length);
56e93d26
JQ
682 }
683 rcu_read_unlock();
dd631697 684 qemu_mutex_unlock(&migration_bitmap_mutex);
56e93d26 685
1ffb5dfd
CF
686 trace_migration_bitmap_sync_end(num_dirty_pages_period);
687
56e93d26
JQ
688 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
689
690 /* more than 1 second = 1000 milliseconds */
f664da80 691 if (end_time > rs->time_last_bitmap_sync + 1000) {
56e93d26
JQ
692 if (migrate_auto_converge()) {
693 /* The following detection logic can be refined later. For now:
694 Check to see if the dirtied bytes are 50% more than the approx.
695 amount of bytes that just got transferred since the last time we
070afca2
JH
696 were in this routine. If that happens twice, start or increase
697 throttling */
56e93d26 698 bytes_xfer_now = ram_bytes_transferred();
070afca2 699
56e93d26
JQ
700 if (s->dirty_pages_rate &&
701 (num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 702 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
8d820d6f 703 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 704 trace_migration_throttle();
8d820d6f 705 rs->dirty_rate_high_cnt = 0;
070afca2 706 mig_throttle_guest_down();
56e93d26 707 }
eac74159 708 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 709 }
070afca2 710
56e93d26
JQ
711 if (migrate_use_xbzrle()) {
712 if (iterations_prev != acct_info.iterations) {
713 acct_info.xbzrle_cache_miss_rate =
714 (double)(acct_info.xbzrle_cache_miss -
715 xbzrle_cache_miss_prev) /
716 (acct_info.iterations - iterations_prev);
717 }
718 iterations_prev = acct_info.iterations;
719 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
720 }
721 s->dirty_pages_rate = num_dirty_pages_period * 1000
f664da80 722 / (end_time - rs->time_last_bitmap_sync);
56e93d26 723 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
f664da80 724 rs->time_last_bitmap_sync = end_time;
56e93d26
JQ
725 num_dirty_pages_period = 0;
726 }
5a987738 727 s->dirty_sync_count = rs->bitmap_sync_count;
4addcd4f 728 if (migrate_use_events()) {
5a987738 729 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 730 }
56e93d26
JQ
731}
732
733/**
3d0684b2 734 * save_zero_page: send the zero page to the stream
56e93d26 735 *
3d0684b2 736 * Returns the number of pages written.
56e93d26
JQ
737 *
738 * @f: QEMUFile where to send the data
739 * @block: block that contains the page we want to send
740 * @offset: offset inside the block for the page
741 * @p: pointer to the page
742 * @bytes_transferred: increase it with the number of transferred bytes
743 */
744static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
745 uint8_t *p, uint64_t *bytes_transferred)
746{
747 int pages = -1;
748
749 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
750 acct_info.dup_pages++;
751 *bytes_transferred += save_page_header(f, block,
752 offset | RAM_SAVE_FLAG_COMPRESS);
753 qemu_put_byte(f, 0);
754 *bytes_transferred += 1;
755 pages = 1;
756 }
757
758 return pages;
759}
760
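/* If the release-ram capability is set and we are in postcopy, discard
 * the given range of already-sent pages so the source can hand the
 * memory back to the host; otherwise do nothing.
 */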
36449157 761static void ram_release_pages(MigrationState *ms, const char *rbname,
53f09a10
PB
762 uint64_t offset, int pages)
763{
764 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
765 return;
766 }
767
36449157 768 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
769}
770
56e93d26 771/**
3d0684b2 772 * ram_save_page: send the given page to the stream
56e93d26 773 *
3d0684b2 774 * Returns the number of pages written.
3fd3c4b3
DDAG
775 * < 0 - error
776 * >=0 - Number of pages written - this might legally be 0
777 * if xbzrle noticed the page was the same.
56e93d26 778 *
6f37bb8b 779 * @rs: current RAM state
3d0684b2 780 * @ms: current migration state
56e93d26
JQ
781 * @f: QEMUFile where to send the data
782 * @block: block that contains the page we want to send
783 * @offset: offset inside the block for the page
784 * @last_stage: if we are at the completion stage
785 * @bytes_transferred: increase it with the number of transferred bytes
786 */
6f37bb8b
JQ
787static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
788 PageSearchStatus *pss, bool last_stage,
789 uint64_t *bytes_transferred)
56e93d26
JQ
790{
791 int pages = -1;
792 uint64_t bytes_xmit;
793 ram_addr_t current_addr;
56e93d26
JQ
794 uint8_t *p;
795 int ret;
796 bool send_async = true;
a08f6890
HZ
797 RAMBlock *block = pss->block;
798 ram_addr_t offset = pss->offset;
56e93d26 799
2f68e399 800 p = block->host + offset;
56e93d26
JQ
801
802 /* If in doubt, send the page as normal */
803 bytes_xmit = 0;
804 ret = ram_control_save_page(f, block->offset,
805 offset, TARGET_PAGE_SIZE, &bytes_xmit);
806 if (bytes_xmit) {
807 *bytes_transferred += bytes_xmit;
808 pages = 1;
809 }
810
811 XBZRLE_cache_lock();
812
813 current_addr = block->offset + offset;
814
6f37bb8b 815 if (block == rs->last_sent_block) {
56e93d26
JQ
816 offset |= RAM_SAVE_FLAG_CONTINUE;
817 }
818 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
819 if (ret != RAM_SAVE_CONTROL_DELAYED) {
820 if (bytes_xmit > 0) {
821 acct_info.norm_pages++;
822 } else if (bytes_xmit == 0) {
823 acct_info.dup_pages++;
824 }
825 }
826 } else {
827 pages = save_zero_page(f, block, offset, p, bytes_transferred);
828 if (pages > 0) {
829 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
830 * page would be stale
831 */
6f37bb8b 832 xbzrle_cache_zero_page(rs, current_addr);
53f09a10 833 ram_release_pages(ms, block->idstr, pss->offset, pages);
6f37bb8b 834 } else if (!rs->ram_bulk_stage &&
9eb14766 835 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
5a987738 836 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
56e93d26
JQ
837 offset, last_stage, bytes_transferred);
838 if (!last_stage) {
839 /* Can't send this cached data async, since the cache page
840 * might get updated before it gets to the wire
841 */
842 send_async = false;
843 }
844 }
845 }
846
847 /* XBZRLE overflow or normal page */
848 if (pages == -1) {
849 *bytes_transferred += save_page_header(f, block,
850 offset | RAM_SAVE_FLAG_PAGE);
851 if (send_async) {
53f09a10
PB
852 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
853 migrate_release_ram() &
854 migration_in_postcopy(ms));
56e93d26
JQ
855 } else {
856 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
857 }
858 *bytes_transferred += TARGET_PAGE_SIZE;
859 pages = 1;
860 acct_info.norm_pages++;
861 }
862
863 XBZRLE_cache_unlock();
864
865 return pages;
866}
867
a7a9a88f
LL
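/* Compress one target page into the given (per-thread) QEMUFile: write a
 * page header with RAM_SAVE_FLAG_COMPRESS_PAGE followed by the
 * zlib-compressed page contents.  Returns the number of bytes queued, or
 * 0 (with the stream marked in error) if compression failed.
 */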
868static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
869 ram_addr_t offset)
56e93d26
JQ
870{
871 int bytes_sent, blen;
a7a9a88f 872 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 873
a7a9a88f 874 bytes_sent = save_page_header(f, block, offset |
56e93d26 875 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 876 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 877 migrate_compress_level());
b3be2896
LL
878 if (blen < 0) {
879 bytes_sent = 0;
880 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
881 error_report("compressed data failed!");
882 } else {
883 bytes_sent += blen;
53f09a10
PB
884 ram_release_pages(migrate_get_current(), block->idstr,
885 offset & TARGET_PAGE_MASK, 1);
b3be2896 886 }
56e93d26
JQ
887
888 return bytes_sent;
889}
890
56e93d26
JQ
891static uint64_t bytes_transferred;
892
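/* Wait until every compression thread has finished its current page, then
 * drain each thread's buffered output into the migration stream.
 */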
893static void flush_compressed_data(QEMUFile *f)
894{
895 int idx, len, thread_count;
896
897 if (!migrate_use_compression()) {
898 return;
899 }
900 thread_count = migrate_compress_threads();
a7a9a88f 901
0d9f9a5c 902 qemu_mutex_lock(&comp_done_lock);
56e93d26 903 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 904 while (!comp_param[idx].done) {
0d9f9a5c 905 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 906 }
a7a9a88f 907 }
0d9f9a5c 908 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
909
910 for (idx = 0; idx < thread_count; idx++) {
911 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 912 if (!comp_param[idx].quit) {
56e93d26
JQ
913 len = qemu_put_qemu_file(f, comp_param[idx].file);
914 bytes_transferred += len;
915 }
a7a9a88f 916 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
917 }
918}
919
920static inline void set_compress_params(CompressParam *param, RAMBlock *block,
921 ram_addr_t offset)
922{
923 param->block = block;
924 param->offset = offset;
925}
926
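/* Hand one page to an idle compression thread: flush that thread's
 * previously buffered output into the stream, queue the new (block,
 * offset) pair and wake the thread.  Blocks on comp_done_cond until a
 * thread becomes free.  Returns the number of pages queued (1).
 */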
927static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
928 ram_addr_t offset,
929 uint64_t *bytes_transferred)
930{
931 int idx, thread_count, bytes_xmit = -1, pages = -1;
932
933 thread_count = migrate_compress_threads();
0d9f9a5c 934 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
935 while (true) {
936 for (idx = 0; idx < thread_count; idx++) {
937 if (comp_param[idx].done) {
a7a9a88f 938 comp_param[idx].done = false;
56e93d26 939 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 940 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 941 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
942 qemu_cond_signal(&comp_param[idx].cond);
943 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
944 pages = 1;
945 acct_info.norm_pages++;
946 *bytes_transferred += bytes_xmit;
947 break;
948 }
949 }
950 if (pages > 0) {
951 break;
952 } else {
0d9f9a5c 953 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
954 }
955 }
0d9f9a5c 956 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
957
958 return pages;
959}
960
961/**
962 * ram_save_compressed_page: compress the given page and send it to the stream
963 *
3d0684b2 964 * Returns the number of pages written.
56e93d26 965 *
6f37bb8b 966 * @rs: current RAM state
3d0684b2 967 * @ms: current migration state
56e93d26
JQ
968 * @f: QEMUFile where to send the data
969 * @block: block that contains the page we want to send
970 * @offset: offset inside the block for the page
971 * @last_stage: if we are at the completion stage
972 * @bytes_transferred: increase it with the number of transferred bytes
973 */
6f37bb8b
JQ
974static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
975 QEMUFile *f,
9eb14766 976 PageSearchStatus *pss, bool last_stage,
56e93d26
JQ
977 uint64_t *bytes_transferred)
978{
979 int pages = -1;
fc50438e 980 uint64_t bytes_xmit = 0;
56e93d26 981 uint8_t *p;
fc50438e 982 int ret, blen;
a08f6890
HZ
983 RAMBlock *block = pss->block;
984 ram_addr_t offset = pss->offset;
56e93d26 985
2f68e399 986 p = block->host + offset;
56e93d26 987
56e93d26
JQ
988 ret = ram_control_save_page(f, block->offset,
989 offset, TARGET_PAGE_SIZE, &bytes_xmit);
990 if (bytes_xmit) {
991 *bytes_transferred += bytes_xmit;
992 pages = 1;
993 }
56e93d26
JQ
994 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
995 if (ret != RAM_SAVE_CONTROL_DELAYED) {
996 if (bytes_xmit > 0) {
997 acct_info.norm_pages++;
998 } else if (bytes_xmit == 0) {
999 acct_info.dup_pages++;
1000 }
1001 }
1002 } else {
1003 /* When starting the process of a new block, the first page of
1004 * the block should be sent out before other pages in the same
1005 * block, and all the pages in the last block should have been sent
1006 * out. Keeping this order is important, because the 'cont' flag
1007 * is used to avoid resending the block name.
1008 */
6f37bb8b 1009 if (block != rs->last_sent_block) {
56e93d26
JQ
1010 flush_compressed_data(f);
1011 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1012 if (pages == -1) {
fc50438e
LL
1013 /* Make sure the first page is sent out before other pages */
1014 bytes_xmit = save_page_header(f, block, offset |
1015 RAM_SAVE_FLAG_COMPRESS_PAGE);
1016 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1017 migrate_compress_level());
1018 if (blen > 0) {
1019 *bytes_transferred += bytes_xmit + blen;
b3be2896 1020 acct_info.norm_pages++;
b3be2896 1021 pages = 1;
fc50438e
LL
1022 } else {
1023 qemu_file_set_error(f, blen);
1024 error_report("compressed data failed!");
b3be2896 1025 }
56e93d26 1026 }
53f09a10
PB
1027 if (pages > 0) {
1028 ram_release_pages(ms, block->idstr, pss->offset, pages);
1029 }
56e93d26 1030 } else {
fc50438e 1031 offset |= RAM_SAVE_FLAG_CONTINUE;
56e93d26
JQ
1032 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1033 if (pages == -1) {
1034 pages = compress_page_with_multi_thread(f, block, offset,
1035 bytes_transferred);
53f09a10
PB
1036 } else {
1037 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1038 }
1039 }
1040 }
1041
1042 return pages;
1043}
1044
3d0684b2
JQ
1045/**
1046 * find_dirty_block: find the next dirty page and update any state
1047 * associated with the search process.
b9e60928 1048 *
3d0684b2 1049 * Returns whether a page is found
b9e60928 1050 *
6f37bb8b 1051 * @rs: current RAM state
3d0684b2
JQ
1052 * @f: QEMUFile where to send the data
1053 * @pss: data about the state of the current dirty page scan
1054 * @again: set to false if the search has scanned the whole of RAM
1055 * @ram_addr_abs: pointer into which to store the address of the dirty page
1056 * within the global ram_addr space
b9e60928 1057 */
6f37bb8b 1058static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1059 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1060{
6f37bb8b 1061 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1062 ram_addr_abs);
6f37bb8b
JQ
1063 if (pss->complete_round && pss->block == rs->last_seen_block &&
1064 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1065 /*
1066 * We've been once around the RAM and haven't found anything.
1067 * Give up.
1068 */
1069 *again = false;
1070 return false;
1071 }
1072 if (pss->offset >= pss->block->used_length) {
1073 /* Didn't find anything in this RAM Block */
1074 pss->offset = 0;
1075 pss->block = QLIST_NEXT_RCU(pss->block, next);
1076 if (!pss->block) {
1077 /* Hit the end of the list */
1078 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1079 /* Flag that we've looped */
1080 pss->complete_round = true;
6f37bb8b 1081 rs->ram_bulk_stage = false;
b9e60928
DDAG
1082 if (migrate_use_xbzrle()) {
1083 /* If xbzrle is on, stop using the data compression at this
1084 * point. In theory, xbzrle can do better than compression.
1085 */
1086 flush_compressed_data(f);
1087 compression_switch = false;
1088 }
1089 }
1090 /* Didn't find anything this time, but try again on the new block */
1091 *again = true;
1092 return false;
1093 } else {
1094 /* Can go around again, but... */
1095 *again = true;
1096 /* We've found something so probably don't need to */
1097 return true;
1098 }
1099}
1100
3d0684b2
JQ
1101/**
1102 * unqueue_page: gets a page off the queue
1103 *
a82d593b 1104 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1105 *
3d0684b2
JQ
1106 * Returns the block of the page (or NULL if none available)
1107 *
1108 * @ms: current migration state
1109 * @offset: used to return the offset within the RAMBlock
1110 * @ram_addr_abs: pointer into which to store the address of the dirty page
1111 * within the global ram_addr space
a82d593b
DDAG
1112 */
1113static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1114 ram_addr_t *ram_addr_abs)
1115{
1116 RAMBlock *block = NULL;
1117
1118 qemu_mutex_lock(&ms->src_page_req_mutex);
1119 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1120 struct MigrationSrcPageRequest *entry =
1121 QSIMPLEQ_FIRST(&ms->src_page_requests);
1122 block = entry->rb;
1123 *offset = entry->offset;
1124 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1125 TARGET_PAGE_MASK;
1126
1127 if (entry->len > TARGET_PAGE_SIZE) {
1128 entry->len -= TARGET_PAGE_SIZE;
1129 entry->offset += TARGET_PAGE_SIZE;
1130 } else {
1131 memory_region_unref(block->mr);
1132 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1133 g_free(entry);
1134 }
1135 }
1136 qemu_mutex_unlock(&ms->src_page_req_mutex);
1137
1138 return block;
1139}
1140
3d0684b2
JQ
1141/**
1142 * get_queued_page: unqueue a page from the postcopy requests
1143 *
1144 * Skips pages that are already sent (!dirty)
a82d593b 1145 *
3d0684b2 1146 * Returns whether a queued page is found
a82d593b 1147 *
6f37bb8b 1148 * @rs: current RAM state
3d0684b2
JQ
1149 * @ms: current migration state
1150 * @pss: data about the state of the current dirty page scan
1151 * @ram_addr_abs: pointer into which to store the address of the dirty page
1152 * within the global ram_addr space
a82d593b 1153 */
6f37bb8b
JQ
1154static bool get_queued_page(RAMState *rs, MigrationState *ms,
1155 PageSearchStatus *pss,
a82d593b
DDAG
1156 ram_addr_t *ram_addr_abs)
1157{
1158 RAMBlock *block;
1159 ram_addr_t offset;
1160 bool dirty;
1161
1162 do {
1163 block = unqueue_page(ms, &offset, ram_addr_abs);
1164 /*
1165 * We're sending this page, and since it's postcopy nothing else
1166 * will dirty it, and we must make sure it doesn't get sent again
1167 * even if this queue request was received after the background
1168 * search already sent it.
1169 */
1170 if (block) {
1171 unsigned long *bitmap;
1172 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1173 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1174 if (!dirty) {
1175 trace_get_queued_page_not_dirty(
1176 block->idstr, (uint64_t)offset,
1177 (uint64_t)*ram_addr_abs,
1178 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1179 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1180 } else {
1181 trace_get_queued_page(block->idstr,
1182 (uint64_t)offset,
1183 (uint64_t)*ram_addr_abs);
1184 }
1185 }
1186
1187 } while (block && !dirty);
1188
1189 if (block) {
1190 /*
1191 * As soon as we start servicing pages out of order, we have
1192 * to kill the bulk stage, since the bulk stage assumes
1193 * in (migration_bitmap_find_and_reset_dirty) that every page is
1194 * dirty; that's no longer true.
1195 */
6f37bb8b 1196 rs->ram_bulk_stage = false;
a82d593b
DDAG
1197
1198 /*
1199 * We want the background search to continue from the queued page
1200 * since the guest is likely to want other pages near to the page
1201 * it just requested.
1202 */
1203 pss->block = block;
1204 pss->offset = offset;
1205 }
1206
1207 return !!block;
1208}
1209
6c595cde 1210/**
5e58f968
JQ
1211 * migration_page_queue_free: drop any remaining pages in the ram
1212 * request queue
6c595cde 1213 *
3d0684b2
JQ
1214 * It should be empty at the end anyway, but in error cases there may
1215 * be some left; if any page is left, we drop it.
1216 *
1217 * @ms: current migration state
6c595cde 1218 */
5e58f968 1219void migration_page_queue_free(MigrationState *ms)
6c595cde
DDAG
1220{
1221 struct MigrationSrcPageRequest *mspr, *next_mspr;
1222 /* This queue generally should be empty - but in the case of a failed
1223 * migration might have some droppings in.
1224 */
1225 rcu_read_lock();
1226 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1227 memory_region_unref(mspr->rb->mr);
1228 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1229 g_free(mspr);
1230 }
1231 rcu_read_unlock();
1232}
1233
1234/**
3d0684b2
JQ
1235 * ram_save_queue_pages: queue the page for transmission
1236 *
1237 * A request from postcopy destination for example.
1238 *
1239 * Returns zero on success or negative on error
1240 *
1241 * @ms: current migration state
1242 * @rbname: Name of the RAMBLock of the request. NULL means the
1243 * same that last one.
1244 * @start: starting address from the start of the RAMBlock
1245 * @len: length (in bytes) to send
6c595cde
DDAG
1246 */
1247int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1248 ram_addr_t start, ram_addr_t len)
1249{
1250 RAMBlock *ramblock;
1251
d3bf5418 1252 ms->postcopy_requests++;
6c595cde
DDAG
1253 rcu_read_lock();
1254 if (!rbname) {
1255 /* Reuse last RAMBlock */
1256 ramblock = ms->last_req_rb;
1257
1258 if (!ramblock) {
1259 /*
1260 * Shouldn't happen, we can't reuse the last RAMBlock if
1261 * it's the 1st request.
1262 */
1263 error_report("ram_save_queue_pages no previous block");
1264 goto err;
1265 }
1266 } else {
1267 ramblock = qemu_ram_block_by_name(rbname);
1268
1269 if (!ramblock) {
1270 /* We shouldn't be asked for a non-existent RAMBlock */
1271 error_report("ram_save_queue_pages no block '%s'", rbname);
1272 goto err;
1273 }
1274 ms->last_req_rb = ramblock;
1275 }
1276 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1277 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1278 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1279 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1280 __func__, start, len, ramblock->used_length);
1281 goto err;
1282 }
1283
1284 struct MigrationSrcPageRequest *new_entry =
1285 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1286 new_entry->rb = ramblock;
1287 new_entry->offset = start;
1288 new_entry->len = len;
1289
1290 memory_region_ref(ramblock->mr);
1291 qemu_mutex_lock(&ms->src_page_req_mutex);
1292 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1293 qemu_mutex_unlock(&ms->src_page_req_mutex);
1294 rcu_read_unlock();
1295
1296 return 0;
1297
1298err:
1299 rcu_read_unlock();
1300 return -1;
1301}
1302
a82d593b 1303/**
3d0684b2 1304 * ram_save_target_page: save one target page
a82d593b 1305 *
3d0684b2 1306 * Returns the number of pages written
a82d593b 1307 *
6f37bb8b 1308 * @rs: current RAM state
3d0684b2 1309 * @ms: current migration state
a82d593b 1310 * @f: QEMUFile where to send the data
3d0684b2 1311 * @pss: data about the page we want to send
a82d593b
DDAG
1312 * @last_stage: if we are at the completion stage
1313 * @bytes_transferred: increase it with the number of transferred bytes
3d0684b2 1314 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1315 */
6f37bb8b 1316static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890 1317 PageSearchStatus *pss,
a82d593b
DDAG
1318 bool last_stage,
1319 uint64_t *bytes_transferred,
1320 ram_addr_t dirty_ram_abs)
1321{
1322 int res = 0;
1323
1324 /* Check whether the page is dirty and, if it is, send it */
1325 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1326 unsigned long *unsentmap;
1327 if (compression_switch && migrate_use_compression()) {
6f37bb8b 1328 res = ram_save_compressed_page(rs, ms, f, pss,
a82d593b
DDAG
1329 last_stage,
1330 bytes_transferred);
1331 } else {
6f37bb8b 1332 res = ram_save_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1333 bytes_transferred);
1334 }
1335
1336 if (res < 0) {
1337 return res;
1338 }
1339 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1340 if (unsentmap) {
1341 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1342 }
3fd3c4b3
DDAG
1343 /* Only update last_sent_block if a block was actually sent; xbzrle
1344 * might have decided the page was identical so didn't bother writing
1345 * to the stream.
1346 */
1347 if (res > 0) {
6f37bb8b 1348 rs->last_sent_block = pss->block;
3fd3c4b3 1349 }
a82d593b
DDAG
1350 }
1351
1352 return res;
1353}
1354
1355/**
3d0684b2 1356 * ram_save_host_page: save a whole host page
a82d593b 1357 *
3d0684b2
JQ
1358 * Starting at *offset send pages up to the end of the current host
1359 * page. It's valid for the initial offset to point into the middle of
1360 * a host page in which case the remainder of the hostpage is sent.
1361 * Only dirty target pages are sent. Note that the host page size may
1362 * be a huge page for this block.
a82d593b 1363 *
3d0684b2
JQ
1364 * Returns the number of pages written or negative on error
1365 *
6f37bb8b 1366 * @rs: current RAM state
3d0684b2 1367 * @ms: current migration state
a82d593b 1368 * @f: QEMUFile where to send the data
3d0684b2 1369 * @pss: data about the page we want to send
a82d593b
DDAG
1370 * @last_stage: if we are at the completion stage
1371 * @bytes_transferred: increase it with the number of transferred bytes
1372 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1373 */
6f37bb8b 1374static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890
HZ
1375 PageSearchStatus *pss,
1376 bool last_stage,
a82d593b
DDAG
1377 uint64_t *bytes_transferred,
1378 ram_addr_t dirty_ram_abs)
1379{
1380 int tmppages, pages = 0;
4c011c37
DDAG
1381 size_t pagesize = qemu_ram_pagesize(pss->block);
1382
a82d593b 1383 do {
6f37bb8b 1384 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1385 bytes_transferred, dirty_ram_abs);
1386 if (tmppages < 0) {
1387 return tmppages;
1388 }
1389
1390 pages += tmppages;
a08f6890 1391 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1392 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1393 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1394
1395 /* The offset we leave with is the last one we looked at */
a08f6890 1396 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1397 return pages;
1398}
6c595cde 1399
56e93d26 1400/**
3d0684b2 1401 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1402 *
1403 * Called within an RCU critical section.
1404 *
3d0684b2 1405 * Returns the number of pages written where zero means no dirty pages
56e93d26 1406 *
6f37bb8b 1407 * @rs: current RAM state
56e93d26
JQ
1408 * @f: QEMUFile where to send the data
1409 * @last_stage: if we are at the completion stage
1410 * @bytes_transferred: increase it with the number of transferred bytes
a82d593b
DDAG
1411 *
1412 * On systems where host-page-size > target-page-size it will send all the
1413 * pages in a host page that are dirty.
56e93d26
JQ
1414 */
1415
6f37bb8b 1416static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
56e93d26
JQ
1417 uint64_t *bytes_transferred)
1418{
b8fb8cb7 1419 PageSearchStatus pss;
a82d593b 1420 MigrationState *ms = migrate_get_current();
56e93d26 1421 int pages = 0;
b9e60928 1422 bool again, found;
f3f491fc
DDAG
1423 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1424 ram_addr_t space */
56e93d26 1425
0827b9e9
AA
1426 /* No dirty page as there is zero RAM */
1427 if (!ram_bytes_total()) {
1428 return pages;
1429 }
1430
6f37bb8b
JQ
1431 pss.block = rs->last_seen_block;
1432 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1433 pss.complete_round = false;
1434
1435 if (!pss.block) {
1436 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1437 }
56e93d26 1438
b9e60928 1439 do {
a82d593b 1440 again = true;
6f37bb8b 1441 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
b9e60928 1442
a82d593b
DDAG
1443 if (!found) {
1444 /* priority queue empty, so just search for something dirty */
6f37bb8b 1445 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
a82d593b 1446 }
f3f491fc 1447
a82d593b 1448 if (found) {
6f37bb8b 1449 pages = ram_save_host_page(rs, ms, f, &pss,
a82d593b
DDAG
1450 last_stage, bytes_transferred,
1451 dirty_ram_abs);
56e93d26 1452 }
b9e60928 1453 } while (!pages && again);
56e93d26 1454
6f37bb8b
JQ
1455 rs->last_seen_block = pss.block;
1456 rs->last_offset = pss.offset;
56e93d26
JQ
1457
1458 return pages;
1459}
1460
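/* Account for pages written to the stream outside the normal save loop
 * (e.g. by a save_page hook): bump the duplicate or normal page counters
 * and, for non-zero pages, the transferred byte count and file position.
 */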
1461void acct_update_position(QEMUFile *f, size_t size, bool zero)
1462{
1463 uint64_t pages = size / TARGET_PAGE_SIZE;
1464 if (zero) {
1465 acct_info.dup_pages += pages;
1466 } else {
1467 acct_info.norm_pages += pages;
1468 bytes_transferred += size;
1469 qemu_update_position(f, size);
1470 }
1471}
1472
1473static ram_addr_t ram_save_remaining(void)
1474{
1475 return migration_dirty_pages;
1476}
1477
1478uint64_t ram_bytes_remaining(void)
1479{
1480 return ram_save_remaining() * TARGET_PAGE_SIZE;
1481}
1482
1483uint64_t ram_bytes_transferred(void)
1484{
1485 return bytes_transferred;
1486}
1487
1488uint64_t ram_bytes_total(void)
1489{
1490 RAMBlock *block;
1491 uint64_t total = 0;
1492
1493 rcu_read_lock();
1494 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1495 total += block->used_length;
1496 rcu_read_unlock();
1497 return total;
1498}
1499
1500void free_xbzrle_decoded_buf(void)
1501{
1502 g_free(xbzrle_decoded_buf);
1503 xbzrle_decoded_buf = NULL;
1504}
1505
60be6340
DL
1506static void migration_bitmap_free(struct BitmapRcu *bmap)
1507{
1508 g_free(bmap->bmap);
f3f491fc 1509 g_free(bmap->unsentmap);
60be6340
DL
1510 g_free(bmap);
1511}
1512
6ad2a215 1513static void ram_migration_cleanup(void *opaque)
56e93d26 1514{
2ff64038
LZ
1515 /* The caller holds the iothread lock or is in a bh, so there is
1516 * no write race against this migration_bitmap
1517 */
60be6340
DL
1518 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1519 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1520 if (bitmap) {
56e93d26 1521 memory_global_dirty_log_stop();
60be6340 1522 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1523 }
1524
1525 XBZRLE_cache_lock();
1526 if (XBZRLE.cache) {
1527 cache_fini(XBZRLE.cache);
1528 g_free(XBZRLE.encoded_buf);
1529 g_free(XBZRLE.current_buf);
adb65dec 1530 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1531 XBZRLE.cache = NULL;
1532 XBZRLE.encoded_buf = NULL;
1533 XBZRLE.current_buf = NULL;
1534 }
1535 XBZRLE_cache_unlock();
1536}
1537
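/* Reset the page-search state so the next pass starts again from the
 * beginning of RAM, back in bulk stage.
 */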
6f37bb8b 1538static void ram_state_reset(RAMState *rs)
56e93d26 1539{
6f37bb8b
JQ
1540 rs->last_seen_block = NULL;
1541 rs->last_sent_block = NULL;
1542 rs->last_offset = 0;
1543 rs->last_version = ram_list.version;
1544 rs->ram_bulk_stage = true;
56e93d26
JQ
1545}
1546
1547#define MAX_WAIT 50 /* ms, half buffered_file limit */
1548
dd631697
LZ
1549void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1550{
1551 /* called in the qemu main thread, so there is
1552 * no write race against this migration_bitmap
1553 */
60be6340
DL
1554 if (migration_bitmap_rcu) {
1555 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1556 bitmap = g_new(struct BitmapRcu, 1);
1557 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1558
1559 /* prevent bits in the migration_bitmap from being set
1560 * by migration_bitmap_sync_range() at the same time.
1561 * It is safe for migration if a migration_bitmap bit is cleared
1562 * at the same time.
1563 */
1564 qemu_mutex_lock(&migration_bitmap_mutex);
60be6340
DL
1565 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1566 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1567
1568 /* We don't have a way to safely extend the unsentmap
1569 * with RCU; so mark it as missing, and entry to postcopy
1570 * will fail.
1571 */
1572 bitmap->unsentmap = NULL;
1573
60be6340 1574 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
dd631697
LZ
1575 qemu_mutex_unlock(&migration_bitmap_mutex);
1576 migration_dirty_pages += new - old;
60be6340 1577 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1578 }
1579}
56e93d26 1580
4f2e4252
DDAG
1581/*
1582 * 'expected' is the value you expect the bitmap mostly to be full
1583 * of; it won't bother printing lines that are all this value.
1584 * If 'todump' is null the migration bitmap is dumped.
1585 */
1586void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1587{
1588 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1589
1590 int64_t cur;
1591 int64_t linelen = 128;
1592 char linebuf[129];
1593
1594 if (!todump) {
1595 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1596 }
1597
1598 for (cur = 0; cur < ram_pages; cur += linelen) {
1599 int64_t curb;
1600 bool found = false;
1601 /*
1602 * Last line; catch the case where the line length
1603 * is longer than remaining ram
1604 */
1605 if (cur + linelen > ram_pages) {
1606 linelen = ram_pages - cur;
1607 }
1608 for (curb = 0; curb < linelen; curb++) {
1609 bool thisbit = test_bit(cur + curb, todump);
1610 linebuf[curb] = thisbit ? '1' : '.';
1611 found = found || (thisbit != expected);
1612 }
1613 if (found) {
1614 linebuf[curb] = '\0';
1615 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1616 }
1617 }
1618}
1619
e0b266f0
DDAG
1620/* **** functions for postcopy ***** */
1621
ced1c616
PB
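/* Walk every RAMBlock and discard the pages that are no longer dirty,
 * i.e. pages that have already been sent to the destination; postcopy
 * will never need to send them from the source again.
 */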
1622void ram_postcopy_migrated_memory_release(MigrationState *ms)
1623{
1624 struct RAMBlock *block;
1625 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1626
1627 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1628 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1629 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1630 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1631
1632 while (run_start < range) {
1633 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1634 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1635 (run_end - run_start) << TARGET_PAGE_BITS);
1636 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1637 }
1638 }
1639}
1640
3d0684b2
JQ
1641/**
1642 * postcopy_send_discard_bm_ram: discard a RAMBlock
1643 *
1644 * Returns zero on success
1645 *
e0b266f0
DDAG
1646 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1647 * Note: At this point the 'unsentmap' is the processed bitmap combined
1648 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1649 *
1650 * @ms: current migration state
1651 * @pds: state for postcopy
1652 * @start: RAMBlock starting page
1653 * @length: RAMBlock size
e0b266f0
DDAG
1654 */
1655static int postcopy_send_discard_bm_ram(MigrationState *ms,
1656 PostcopyDiscardState *pds,
1657 unsigned long start,
1658 unsigned long length)
1659{
1660 unsigned long end = start + length; /* one after the end */
1661 unsigned long current;
1662 unsigned long *unsentmap;
1663
1664 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1665 for (current = start; current < end; ) {
1666 unsigned long one = find_next_bit(unsentmap, end, current);
1667
1668 if (one <= end) {
1669 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1670 unsigned long discard_length;
1671
1672 if (zero >= end) {
1673 discard_length = end - one;
1674 } else {
1675 discard_length = zero - one;
1676 }
d688c62d
DDAG
1677 if (discard_length) {
1678 postcopy_discard_send_range(ms, pds, one, discard_length);
1679 }
e0b266f0
DDAG
1680 current = one + discard_length;
1681 } else {
1682 current = one;
1683 }
1684 }
1685
1686 return 0;
1687}
1688
3d0684b2
JQ
1689/**
1690 * postcopy_each_ram_send_discard: discard all RAMBlocks
1691 *
1692 * Returns 0 for success or negative for error
1693 *
e0b266f0
DDAG
1694 * Utility for the outgoing postcopy code.
1695 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1696 * passing it bitmap indexes and name.
e0b266f0
DDAG
1697 * (qemu_ram_foreach_block ends up passing unscaled lengths
1698 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1699 *
1700 * @ms: current migration state
e0b266f0
DDAG
1701 */
1702static int postcopy_each_ram_send_discard(MigrationState *ms)
1703{
1704 struct RAMBlock *block;
1705 int ret;
1706
1707 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1708 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1709 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1710 first,
1711 block->idstr);
1712
1713 /*
1714 * Postcopy sends chunks of bitmap over the wire, but it
1715 * just needs page indexes at this point, which avoids it having
1716 * target-page-specific code.
1717 */
1718 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1719 block->used_length >> TARGET_PAGE_BITS);
1720 postcopy_discard_send_finish(ms, pds);
1721 if (ret) {
1722 return ret;
1723 }
1724 }
1725
1726 return 0;
1727}
1728
3d0684b2
JQ
1729/**
1730 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1731 *
1732 * Helper for postcopy_chunk_hostpages; it's called twice to
1733 * canonicalize the two bitmaps, which are similar but one of them is
1734 * inverted.
99e314eb 1735 *
3d0684b2
JQ
1736 * Postcopy requires that all target pages in a hostpage are dirty or
1737 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1738 *
3d0684b2
JQ
1739 * @ms: current migration state
1740 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1741 * otherwise we need to canonicalize partially dirty host pages
1742 * @block: block that contains the page we want to canonicalize
1743 * @pds: state for postcopy
99e314eb
DDAG
1744 */
1745static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1746 RAMBlock *block,
1747 PostcopyDiscardState *pds)
1748{
1749 unsigned long *bitmap;
1750 unsigned long *unsentmap;
29c59172 1751 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1752 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1753 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1754 unsigned long last = first + (len - 1);
1755 unsigned long run_start;
1756
29c59172
DDAG
1757 if (block->page_size == TARGET_PAGE_SIZE) {
1758 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1759 return;
1760 }
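    /*
     * Worked example (illustrative, assuming 2MB huge pages and 4KB
     * target pages, so host_ratio == 512): if a run of dirty target
     * pages starts at page index 515, host_offset == 3, so the run is
     * rounded down to the host-page boundary at 512 and, when a fixup
     * is needed, the whole host page 512..1023 is discarded and
     * re-marked dirty and unsent below.
     */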
1761
99e314eb
DDAG
1762 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1763 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1764
1765 if (unsent_pass) {
1766 /* Find a sent page */
1767 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1768 } else {
1769 /* Find a dirty page */
1770 run_start = find_next_bit(bitmap, last + 1, first);
1771 }
1772
1773 while (run_start <= last) {
1774 bool do_fixup = false;
1775 unsigned long fixup_start_addr;
1776 unsigned long host_offset;
1777
1778 /*
1779 * If the start of this run of pages is in the middle of a host
1780 * page, then we need to fixup this host page.
1781 */
1782 host_offset = run_start % host_ratio;
1783 if (host_offset) {
1784 do_fixup = true;
1785 run_start -= host_offset;
1786 fixup_start_addr = run_start;
1787 /* For the next pass */
1788 run_start = run_start + host_ratio;
1789 } else {
1790 /* Find the end of this run */
1791 unsigned long run_end;
1792 if (unsent_pass) {
1793 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1794 } else {
1795 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1796 }
1797 /*
1798 * If the end isn't at the start of a host page, then the
1799 * run doesn't finish at the end of a host page
1800 * and we need to discard.
1801 */
1802 host_offset = run_end % host_ratio;
1803 if (host_offset) {
1804 do_fixup = true;
1805 fixup_start_addr = run_end - host_offset;
1806 /*
1807 * This host page has gone, the next loop iteration starts
1808 * from after the fixup
1809 */
1810 run_start = fixup_start_addr + host_ratio;
1811 } else {
1812 /*
1813 * No discards on this iteration, next loop starts from
1814 * next sent/dirty page
1815 */
1816 run_start = run_end + 1;
1817 }
1818 }
1819
1820 if (do_fixup) {
1821 unsigned long page;
1822
1823 /* Tell the destination to discard this page */
1824 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1825 /* For the unsent_pass we:
1826 * discard partially sent pages
1827 * For the !unsent_pass (dirty) we:
1828 * discard partially dirty pages that were sent
1829 * (any partially sent pages were already discarded
1830 * by the previous unsent_pass)
1831 */
1832 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1833 host_ratio);
1834 }
1835
1836 /* Clean up the bitmap */
1837 for (page = fixup_start_addr;
1838 page < fixup_start_addr + host_ratio; page++) {
1839 /* All pages in this host page are now not sent */
1840 set_bit(page, unsentmap);
1841
1842 /*
1843 * Remark them as dirty, updating the count for any pages
1844 * that weren't previously dirty.
1845 */
1846 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1847 }
1848 }
1849
1850 if (unsent_pass) {
1851 /* Find the next sent page for the next iteration */
1852 run_start = find_next_zero_bit(unsentmap, last + 1,
1853 run_start);
1854 } else {
1855 /* Find the next dirty page for the next iteration */
1856 run_start = find_next_bit(bitmap, last + 1, run_start);
1857 }
1858 }
1859}
1860
3d0684b2
JQ
1861/**
1862 * postcopy_chunk_hostpages: discard any partially sent host page
1863 *
99e314eb
DDAG
1864 * Utility for the outgoing postcopy code.
1865 *
1866 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1867 * dirty host-page size chunks as all dirty.  Here the host page
1868 * is the host page size of the particular RAMBlock, i.e. it might be a huge page
99e314eb 1869 *
3d0684b2
JQ
1870 * Returns zero on success
1871 *
1872 * @ms: current migration state
99e314eb
DDAG
1873 */
1874static int postcopy_chunk_hostpages(MigrationState *ms)
1875{
6f37bb8b 1876 RAMState *rs = &ram_state;
99e314eb
DDAG
1877 struct RAMBlock *block;
1878
99e314eb 1879 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1880 rs->last_seen_block = NULL;
1881 rs->last_sent_block = NULL;
1882 rs->last_offset = 0;
99e314eb
DDAG
1883
1884 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1885 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1886
1887 PostcopyDiscardState *pds =
1888 postcopy_discard_send_init(ms, first, block->idstr);
1889
1890 /* First pass: Discard all partially sent host pages */
1891 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1892 /*
1893 * Second pass: Ensure that all partially dirty host pages are made
1894 * fully dirty.
1895 */
1896 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1897
1898 postcopy_discard_send_finish(ms, pds);
1899 } /* ram_list loop */
1900
1901 return 0;
1902}
1903
3d0684b2
JQ
1904/**
1905 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1906 *
1907 * Returns zero on success
1908 *
e0b266f0
DDAG
1909 * Transmit the set of pages to be discarded after precopy to the target;
1910 * these are pages that:
1911 * a) have been previously transmitted but are now dirty again, or
1912 * b) have never been transmitted; this ensures that any pages on the
1913 * destination that have been mapped by background tasks get
1914 * discarded (transparent huge pages are the specific concern)
1915 * Hopefully this is pretty sparse
3d0684b2
JQ
1916 *
1917 * @ms: current migration state
e0b266f0
DDAG
1918 */
1919int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1920{
1921 int ret;
1922 unsigned long *bitmap, *unsentmap;
1923
1924 rcu_read_lock();
1925
1926 /* This should be our last sync, the src is now paused */
8d820d6f 1927 migration_bitmap_sync(&ram_state);
e0b266f0
DDAG
1928
1929 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1930 if (!unsentmap) {
1931 /* We don't have a safe way to resize the unsentmap, so
1932 * if the bitmap was resized it will be NULL at this
1933 * point.
1934 */
1935 error_report("migration ram resized during precopy phase");
1936 rcu_read_unlock();
1937 return -EINVAL;
1938 }
1939
29c59172 1940 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1941 ret = postcopy_chunk_hostpages(ms);
1942 if (ret) {
1943 rcu_read_unlock();
1944 return ret;
1945 }
1946
e0b266f0
DDAG
1947 /*
1948 * Update the unsentmap to be unsentmap = unsentmap | dirty
1949 */
1950 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1951 bitmap_or(unsentmap, unsentmap, bitmap,
1952 last_ram_offset() >> TARGET_PAGE_BITS);
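/*
 * From this point on, a set bit in unsentmap means "must be discarded
 * on the destination", covering both pages that were never sent and
 * pages that were re-dirtied after being sent.
 */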
1953
1954
1955 trace_ram_postcopy_send_discard_bitmap();
1956#ifdef DEBUG_POSTCOPY
1957 ram_debug_dump_bitmap(unsentmap, true);
1958#endif
1959
1960 ret = postcopy_each_ram_send_discard(ms);
1961 rcu_read_unlock();
1962
1963 return ret;
1964}
1965
3d0684b2
JQ
1966/**
1967 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1968 *
3d0684b2 1969 * Returns zero on success
e0b266f0 1970 *
3d0684b2 1971 * @mis: current migration incoming state
36449157
JQ
1972 * @rbname: name of the RAMBlock of the request. NULL means the
1973 * same as the last one.
3d0684b2
JQ
1974 * @start: byte offset within the RAMBlock at which to start discarding
1975 * @length: number of bytes to discard
e0b266f0
DDAG
1976 */
1977int ram_discard_range(MigrationIncomingState *mis,
36449157 1978 const char *rbname,
e0b266f0
DDAG
1979 uint64_t start, size_t length)
1980{
1981 int ret = -1;
1982
36449157 1983 trace_ram_discard_range(rbname, start, length);
d3a5038c 1984
e0b266f0 1985 rcu_read_lock();
36449157 1986 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1987
1988 if (!rb) {
36449157 1989 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1990 goto err;
1991 }
1992
d3a5038c 1993 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1994
1995err:
1996 rcu_read_unlock();
1997
1998 return ret;
1999}
2000
6f37bb8b 2001static int ram_save_init_globals(RAMState *rs)
56e93d26 2002{
56e93d26
JQ
2003 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2004
8d820d6f 2005 rs->dirty_rate_high_cnt = 0;
5a987738 2006 rs->bitmap_sync_count = 0;
f664da80 2007 migration_bitmap_sync_init(rs);
dd631697 2008 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
2009
2010 if (migrate_use_xbzrle()) {
2011 XBZRLE_cache_lock();
adb65dec 2012 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2013 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2014 TARGET_PAGE_SIZE,
2015 TARGET_PAGE_SIZE);
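        /* The xbzrle cache size option is given in bytes; it is
         * converted to a page count above before being handed to
         * cache_init().
         */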
2016 if (!XBZRLE.cache) {
2017 XBZRLE_cache_unlock();
2018 error_report("Error creating cache");
2019 return -1;
2020 }
2021 XBZRLE_cache_unlock();
2022
2023 /* We prefer not to abort if there is no memory */
2024 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2025 if (!XBZRLE.encoded_buf) {
2026 error_report("Error allocating encoded_buf");
2027 return -1;
2028 }
2029
2030 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2031 if (!XBZRLE.current_buf) {
2032 error_report("Error allocating current_buf");
2033 g_free(XBZRLE.encoded_buf);
2034 XBZRLE.encoded_buf = NULL;
2035 return -1;
2036 }
2037
2038 acct_clear();
2039 }
2040
49877834
PB
2041 /* For memory_global_dirty_log_start below. */
2042 qemu_mutex_lock_iothread();
2043
56e93d26
JQ
2044 qemu_mutex_lock_ramlist();
2045 rcu_read_lock();
2046 bytes_transferred = 0;
6f37bb8b 2047 ram_state_reset(rs);
56e93d26 2048
f3f491fc 2049 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
0827b9e9
AA
2050 /* Skip setting bitmap if there is no RAM */
2051 if (ram_bytes_total()) {
2052 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2053 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2054 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2055
2056 if (migrate_postcopy_ram()) {
2057 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2058 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2059 }
f3f491fc
DDAG
2060 }
2061
56e93d26
JQ
2062 /*
2063 * Count the total number of pages used by ram blocks not including any
2064 * gaps due to alignment or unplugs.
2065 */
2066 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2067
2068 memory_global_dirty_log_start();
8d820d6f 2069 migration_bitmap_sync(rs);
56e93d26 2070 qemu_mutex_unlock_ramlist();
49877834 2071 qemu_mutex_unlock_iothread();
a91246c9
HZ
2072 rcu_read_unlock();
2073
2074 return 0;
2075}
2076
3d0684b2
JQ
2077/*
2078 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2079 * long-running RCU critical section. When rcu-reclaims in the code
2080 * start to become numerous it will be necessary to reduce the
2081 * granularity of these critical sections.
2082 */
2083
3d0684b2
JQ
2084/**
2085 * ram_save_setup: Setup RAM for migration
2086 *
2087 * Returns zero to indicate success and negative for error
2088 *
2089 * @f: QEMUFile where to send the data
2090 * @opaque: RAMState pointer
2091 */
a91246c9
HZ
2092static int ram_save_setup(QEMUFile *f, void *opaque)
2093{
6f37bb8b 2094 RAMState *rs = opaque;
a91246c9
HZ
2095 RAMBlock *block;
2096
2097 /* migration has already set up the bitmap, reuse it. */
2098 if (!migration_in_colo_state()) {
6f37bb8b 2099 if (ram_save_init_globals(rs) < 0) {
a91246c9
HZ
2100 return -1;
2101 }
2102 }
2103
2104 rcu_read_lock();
56e93d26
JQ
2105
2106 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2107
2108 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2109 qemu_put_byte(f, strlen(block->idstr));
2110 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2111 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2112 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2113 qemu_put_be64(f, block->page_size);
2114 }
56e93d26
JQ
2115 }
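/*
 * The per-RAMBlock header written above is: a one-byte idstr length,
 * the idstr bytes, the block's used_length as a be64 and, when
 * postcopy is enabled and the block's page size differs from the host
 * page size, that page size as an extra be64.
 */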
2116
2117 rcu_read_unlock();
2118
2119 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2120 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2121
2122 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2123
2124 return 0;
2125}
2126
3d0684b2
JQ
2127/**
2128 * ram_save_iterate: iterative stage for migration
2129 *
2130 * Returns zero to indicate success and negative for error
2131 *
2132 * @f: QEMUFile where to send the data
2133 * @opaque: RAMState pointer
2134 */
56e93d26
JQ
2135static int ram_save_iterate(QEMUFile *f, void *opaque)
2136{
6f37bb8b 2137 RAMState *rs = opaque;
56e93d26
JQ
2138 int ret;
2139 int i;
2140 int64_t t0;
5c90308f 2141 int done = 0;
56e93d26
JQ
2142
2143 rcu_read_lock();
6f37bb8b
JQ
2144 if (ram_list.version != rs->last_version) {
2145 ram_state_reset(rs);
56e93d26
JQ
2146 }
2147
2148 /* Read version before ram_list.blocks */
2149 smp_rmb();
2150
2151 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2152
2153 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2154 i = 0;
2155 while ((ret = qemu_file_rate_limit(f)) == 0) {
2156 int pages;
2157
6f37bb8b 2158 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
56e93d26
JQ
2159 /* no more pages to send */
2160 if (pages == 0) {
5c90308f 2161 done = 1;
56e93d26
JQ
2162 break;
2163 }
56e93d26 2164 acct_info.iterations++;
070afca2 2165
56e93d26
JQ
2166 /* we want to check in the 1st loop, just in case it was the 1st time
2167 and we had to sync the dirty bitmap.
2168 qemu_get_clock_ns() is a bit expensive, so we only check once
2169 every few iterations
2170 */
2171 if ((i & 63) == 0) {
2172 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2173 if (t1 > MAX_WAIT) {
55c4446b 2174 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2175 break;
2176 }
2177 }
2178 i++;
2179 }
2180 flush_compressed_data(f);
2181 rcu_read_unlock();
2182
2183 /*
2184 * Must occur before EOS (or any QEMUFile operation)
2185 * because of RDMA protocol.
2186 */
2187 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2188
2189 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2190 bytes_transferred += 8;
2191
2192 ret = qemu_file_get_error(f);
2193 if (ret < 0) {
2194 return ret;
2195 }
2196
5c90308f 2197 return done;
56e93d26
JQ
2198}
2199
3d0684b2
JQ
2200/**
2201 * ram_save_complete: function called to send the remaining amount of ram
2202 *
2203 * Returns zero to indicate success
2204 *
2205 * Called with iothread lock
2206 *
2207 * @f: QEMUFile where to send the data
2208 * @opaque: RAMState pointer
2209 */
56e93d26
JQ
2210static int ram_save_complete(QEMUFile *f, void *opaque)
2211{
6f37bb8b
JQ
2212 RAMState *rs = opaque;
2213
56e93d26
JQ
2214 rcu_read_lock();
2215
663e6c1d 2216 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2217 migration_bitmap_sync(rs);
663e6c1d 2218 }
56e93d26
JQ
2219
2220 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2221
2222 /* try transferring iterative blocks of memory */
2223
2224 /* flush all remaining blocks regardless of rate limiting */
2225 while (true) {
2226 int pages;
2227
6f37bb8b 2228 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
a91246c9 2229 &bytes_transferred);
56e93d26
JQ
2230 /* no more blocks to send */
2231 if (pages == 0) {
2232 break;
2233 }
2234 }
2235
2236 flush_compressed_data(f);
2237 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2238
2239 rcu_read_unlock();
d09a6fde 2240
56e93d26
JQ
2241 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2242
2243 return 0;
2244}
2245
c31b098f
DDAG
2246static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2247 uint64_t *non_postcopiable_pending,
2248 uint64_t *postcopiable_pending)
56e93d26 2249{
8d820d6f 2250 RAMState *rs = opaque;
56e93d26
JQ
2251 uint64_t remaining_size;
2252
2253 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2254
663e6c1d
DDAG
2255 if (!migration_in_postcopy(migrate_get_current()) &&
2256 remaining_size < max_size) {
56e93d26
JQ
2257 qemu_mutex_lock_iothread();
2258 rcu_read_lock();
8d820d6f 2259 migration_bitmap_sync(rs);
56e93d26
JQ
2260 rcu_read_unlock();
2261 qemu_mutex_unlock_iothread();
2262 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2263 }
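    /* Note: the bitmap is only re-synced above when the (possibly
     * stale) estimate is already below max_size; otherwise the stale
     * value is reported as-is.
     */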
c31b098f
DDAG
2264
2265 /* We can do postcopy, and all the data is postcopiable */
2266 *postcopiable_pending += remaining_size;
56e93d26
JQ
2267}
2268
2269static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2270{
2271 unsigned int xh_len;
2272 int xh_flags;
063e760a 2273 uint8_t *loaded_data;
56e93d26
JQ
2274
2275 if (!xbzrle_decoded_buf) {
2276 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2277 }
063e760a 2278 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2279
2280 /* extract RLE header */
2281 xh_flags = qemu_get_byte(f);
2282 xh_len = qemu_get_be16(f);
2283
2284 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2285 error_report("Failed to load XBZRLE page - wrong compression!");
2286 return -1;
2287 }
2288
2289 if (xh_len > TARGET_PAGE_SIZE) {
2290 error_report("Failed to load XBZRLE page - len overflow!");
2291 return -1;
2292 }
2293 /* load data and decode */
063e760a 2294 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2295
2296 /* decode RLE */
063e760a 2297 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2298 TARGET_PAGE_SIZE) == -1) {
2299 error_report("Failed to load XBZRLE page - decode error!");
2300 return -1;
2301 }
2302
2303 return 0;
2304}
2305
3d0684b2
JQ
2306/**
2307 * ram_block_from_stream: read a RAMBlock id from the migration stream
2308 *
2309 * Must be called from within a rcu critical section.
2310 *
56e93d26 2311 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2312 *
3d0684b2
JQ
2313 * @f: QEMUFile where to read the data from
2314 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2315 */
3d0684b2 2316static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2317{
2318 static RAMBlock *block = NULL;
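    /* Cached across calls: RAM_SAVE_FLAG_CONTINUE means "same block as
     * the previous page", so the sender only resends the block id when
     * it changes.
     */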
2319 char id[256];
2320 uint8_t len;
2321
2322 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2323 if (!block) {
56e93d26
JQ
2324 error_report("Ack, bad migration stream!");
2325 return NULL;
2326 }
4c4bad48 2327 return block;
56e93d26
JQ
2328 }
2329
2330 len = qemu_get_byte(f);
2331 qemu_get_buffer(f, (uint8_t *)id, len);
2332 id[len] = 0;
2333
e3dd7493 2334 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2335 if (!block) {
2336 error_report("Can't find block %s", id);
2337 return NULL;
56e93d26
JQ
2338 }
2339
4c4bad48
HZ
2340 return block;
2341}
2342
2343static inline void *host_from_ram_block_offset(RAMBlock *block,
2344 ram_addr_t offset)
2345{
2346 if (!offset_in_ramblock(block, offset)) {
2347 return NULL;
2348 }
2349
2350 return block->host + offset;
56e93d26
JQ
2351}
2352
3d0684b2
JQ
2353/**
2354 * ram_handle_compressed: handle the zero page case
2355 *
56e93d26
JQ
2356 * If a page (or a whole RDMA chunk) has been
2357 * determined to be zero, then zap it.
3d0684b2
JQ
2358 *
2359 * @host: host address for the zero page
2360 * @ch: what the page is filled from. We only support zero
2361 * @size: size of the zero page
56e93d26
JQ
2362 */
2363void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2364{
2365 if (ch != 0 || !is_zero_range(host, size)) {
2366 memset(host, ch, size);
2367 }
2368}
2369
2370static void *do_data_decompress(void *opaque)
2371{
2372 DecompressParam *param = opaque;
2373 unsigned long pagesize;
33d151f4
LL
2374 uint8_t *des;
2375 int len;
56e93d26 2376
33d151f4 2377 qemu_mutex_lock(&param->mutex);
90e56fb4 2378 while (!param->quit) {
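        /*
         * Note: param->des doubles as the "work available" flag: the
         * feeder thread sets it under param->mutex and signals
         * param->cond; it is cleared here before the (potentially
         * slow) uncompress() runs outside the lock.
         */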
33d151f4
LL
2379 if (param->des) {
2380 des = param->des;
2381 len = param->len;
2382 param->des = 0;
2383 qemu_mutex_unlock(&param->mutex);
2384
56e93d26 2385 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2386 /* uncompress() can fail in some cases, especially
2387 * when the page was dirtied while it was being compressed; that is
2388 * not a problem because the dirty page will be retransmitted
2389 * and uncompress() won't corrupt the data in other pages.
2390 */
33d151f4
LL
2391 uncompress((Bytef *)des, &pagesize,
2392 (const Bytef *)param->compbuf, len);
73a8912b 2393
33d151f4
LL
2394 qemu_mutex_lock(&decomp_done_lock);
2395 param->done = true;
2396 qemu_cond_signal(&decomp_done_cond);
2397 qemu_mutex_unlock(&decomp_done_lock);
2398
2399 qemu_mutex_lock(&param->mutex);
2400 } else {
2401 qemu_cond_wait(&param->cond, &param->mutex);
2402 }
56e93d26 2403 }
33d151f4 2404 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2405
2406 return NULL;
2407}
2408
5533b2e9
LL
2409static void wait_for_decompress_done(void)
2410{
2411 int idx, thread_count;
2412
2413 if (!migrate_use_compression()) {
2414 return;
2415 }
2416
2417 thread_count = migrate_decompress_threads();
2418 qemu_mutex_lock(&decomp_done_lock);
2419 for (idx = 0; idx < thread_count; idx++) {
2420 while (!decomp_param[idx].done) {
2421 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2422 }
2423 }
2424 qemu_mutex_unlock(&decomp_done_lock);
2425}
2426
56e93d26
JQ
2427void migrate_decompress_threads_create(void)
2428{
2429 int i, thread_count;
2430
2431 thread_count = migrate_decompress_threads();
2432 decompress_threads = g_new0(QemuThread, thread_count);
2433 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2434 qemu_mutex_init(&decomp_done_lock);
2435 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2436 for (i = 0; i < thread_count; i++) {
2437 qemu_mutex_init(&decomp_param[i].mutex);
2438 qemu_cond_init(&decomp_param[i].cond);
2439 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2440 decomp_param[i].done = true;
90e56fb4 2441 decomp_param[i].quit = false;
56e93d26
JQ
2442 qemu_thread_create(decompress_threads + i, "decompress",
2443 do_data_decompress, decomp_param + i,
2444 QEMU_THREAD_JOINABLE);
2445 }
2446}
2447
2448void migrate_decompress_threads_join(void)
2449{
2450 int i, thread_count;
2451
56e93d26
JQ
2452 thread_count = migrate_decompress_threads();
2453 for (i = 0; i < thread_count; i++) {
2454 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2455 decomp_param[i].quit = true;
56e93d26
JQ
2456 qemu_cond_signal(&decomp_param[i].cond);
2457 qemu_mutex_unlock(&decomp_param[i].mutex);
2458 }
2459 for (i = 0; i < thread_count; i++) {
2460 qemu_thread_join(decompress_threads + i);
2461 qemu_mutex_destroy(&decomp_param[i].mutex);
2462 qemu_cond_destroy(&decomp_param[i].cond);
2463 g_free(decomp_param[i].compbuf);
2464 }
2465 g_free(decompress_threads);
2466 g_free(decomp_param);
56e93d26
JQ
2467 decompress_threads = NULL;
2468 decomp_param = NULL;
56e93d26
JQ
2469}
2470
c1bc6626 2471static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2472 void *host, int len)
2473{
2474 int idx, thread_count;
2475
2476 thread_count = migrate_decompress_threads();
73a8912b 2477 qemu_mutex_lock(&decomp_done_lock);
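    /* Find an idle decompression thread (done == true); if none is
     * free, wait on decomp_done_cond until a worker signals completion,
     * then rescan.
     */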
56e93d26
JQ
2478 while (true) {
2479 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2480 if (decomp_param[idx].done) {
33d151f4
LL
2481 decomp_param[idx].done = false;
2482 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2483 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2484 decomp_param[idx].des = host;
2485 decomp_param[idx].len = len;
33d151f4
LL
2486 qemu_cond_signal(&decomp_param[idx].cond);
2487 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2488 break;
2489 }
2490 }
2491 if (idx < thread_count) {
2492 break;
73a8912b
LL
2493 } else {
2494 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2495 }
2496 }
73a8912b 2497 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2498}
2499
3d0684b2
JQ
2500/**
2501 * ram_postcopy_incoming_init: allocate postcopy data structures
2502 *
2503 * Returns 0 for success and negative if there was one error
2504 *
2505 * @mis: current migration incoming state
2506 *
2507 * Allocate data structures etc needed by incoming migration with
2508 * postcopy-ram. postcopy-ram's similarly named
2509 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2510 */
2511int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2512{
2513 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2514
2515 return postcopy_ram_incoming_init(mis, ram_pages);
2516}
2517
3d0684b2
JQ
2518/**
2519 * ram_load_postcopy: load a page in postcopy case
2520 *
2521 * Returns 0 for success or -errno in case of error
2522 *
a7180877
DDAG
2523 * Called in postcopy mode by ram_load().
2524 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2525 *
2526 * @f: QEMUFile to read the data from
a7180877
DDAG
2527 */
2528static int ram_load_postcopy(QEMUFile *f)
2529{
2530 int flags = 0, ret = 0;
2531 bool place_needed = false;
28abd200 2532 bool matching_page_sizes = false;
a7180877
DDAG
2533 MigrationIncomingState *mis = migration_incoming_get_current();
2534 /* Temporary page that is later 'placed' */
2535 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2536 void *last_host = NULL;
a3b6ff6d 2537 bool all_zero = false;
a7180877
DDAG
2538
2539 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2540 ram_addr_t addr;
2541 void *host = NULL;
2542 void *page_buffer = NULL;
2543 void *place_source = NULL;
df9ff5e1 2544 RAMBlock *block = NULL;
a7180877 2545 uint8_t ch;
a7180877
DDAG
2546
2547 addr = qemu_get_be64(f);
2548 flags = addr & ~TARGET_PAGE_MASK;
2549 addr &= TARGET_PAGE_MASK;
2550
2551 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2552 place_needed = false;
2553 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2554 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2555
2556 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2557 if (!host) {
2558 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2559 ret = -EINVAL;
2560 break;
2561 }
28abd200 2562 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2563 /*
28abd200
DDAG
2564 * Postcopy requires that we place whole host pages atomically;
2565 * these may be huge pages for RAMBlocks that are backed by
2566 * hugetlbfs.
a7180877
DDAG
2567 * To make it atomic, the data is read into a temporary page
2568 * that's moved into place later.
2569 * The migration protocol uses (possibly smaller) target pages;
2570 * however, the source ensures it always sends all the components
2571 * of a host page in order.
2572 */
2573 page_buffer = postcopy_host_page +
28abd200 2574 ((uintptr_t)host & (block->page_size - 1));
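            /*
             * Illustrative example (assuming a 2MB huge page and 4KB
             * target pages): the 512 target pages of one host page
             * arrive in order and are accumulated at their offset
             * within postcopy_host_page; only after the 512th one has
             * been read is the whole host page placed atomically.
             */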
a7180877 2575 /* If all TP are zero then we can optimise the place */
28abd200 2576 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2577 all_zero = true;
c53b7ddc
DDAG
2578 } else {
2579 /* not the 1st TP within the HP */
2580 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2581 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2582 host, last_host);
2583 ret = -EINVAL;
2584 break;
2585 }
a7180877
DDAG
2586 }
2587
c53b7ddc 2588
a7180877
DDAG
2589 /*
2590 * If it's the last part of a host page then we place the host
2591 * page
2592 */
2593 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2594 (block->page_size - 1)) == 0;
a7180877
DDAG
2595 place_source = postcopy_host_page;
2596 }
c53b7ddc 2597 last_host = host;
a7180877
DDAG
2598
2599 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2600 case RAM_SAVE_FLAG_COMPRESS:
2601 ch = qemu_get_byte(f);
2602 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2603 if (ch) {
2604 all_zero = false;
2605 }
2606 break;
2607
2608 case RAM_SAVE_FLAG_PAGE:
2609 all_zero = false;
2610 if (!place_needed || !matching_page_sizes) {
2611 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2612 } else {
2613 /* Avoids the copy out of the qemu_file buffer, since postcopy
2614 * is going to copy the page into place later anyway; this is
2615 * only possible when the read is done in one go (matching page sizes)
2616 */
2617 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2618 TARGET_PAGE_SIZE);
2619 }
2620 break;
2621 case RAM_SAVE_FLAG_EOS:
2622 /* normal exit */
2623 break;
2624 default:
2625 error_report("Unknown combination of migration flags: %#x"
2626 " (postcopy mode)", flags);
2627 ret = -EINVAL;
2628 }
2629
2630 if (place_needed) {
2631 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2632 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2633
a7180877 2634 if (all_zero) {
df9ff5e1
DDAG
2635 ret = postcopy_place_page_zero(mis, place_dest,
2636 block->page_size);
a7180877 2637 } else {
df9ff5e1
DDAG
2638 ret = postcopy_place_page(mis, place_dest,
2639 place_source, block->page_size);
a7180877
DDAG
2640 }
2641 }
2642 if (!ret) {
2643 ret = qemu_file_get_error(f);
2644 }
2645 }
2646
2647 return ret;
2648}
2649
56e93d26
JQ
2650static int ram_load(QEMUFile *f, void *opaque, int version_id)
2651{
2652 int flags = 0, ret = 0;
2653 static uint64_t seq_iter;
2654 int len = 0;
a7180877
DDAG
2655 /*
2656 * If system is running in postcopy mode, page inserts to host memory must
2657 * be atomic
2658 */
2659 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2660 /* ADVISE arrives earlier; it shows the source has the postcopy capability enabled */
2661 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2662
2663 seq_iter++;
2664
2665 if (version_id != 4) {
2666 ret = -EINVAL;
2667 }
2668
2669 /* This RCU critical section can be very long running.
2670 * When RCU reclaims in the code start to become numerous,
2671 * it will be necessary to reduce the granularity of this
2672 * critical section.
2673 */
2674 rcu_read_lock();
a7180877
DDAG
2675
2676 if (postcopy_running) {
2677 ret = ram_load_postcopy(f);
2678 }
2679
2680 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2681 ram_addr_t addr, total_ram_bytes;
a776aa15 2682 void *host = NULL;
56e93d26
JQ
2683 uint8_t ch;
2684
2685 addr = qemu_get_be64(f);
2686 flags = addr & ~TARGET_PAGE_MASK;
2687 addr &= TARGET_PAGE_MASK;
2688
a776aa15
DDAG
2689 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2690 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2691 RAMBlock *block = ram_block_from_stream(f, flags);
2692
2693 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2694 if (!host) {
2695 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2696 ret = -EINVAL;
2697 break;
2698 }
2699 }
2700
56e93d26
JQ
2701 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2702 case RAM_SAVE_FLAG_MEM_SIZE:
2703 /* Synchronize RAM block list */
2704 total_ram_bytes = addr;
2705 while (!ret && total_ram_bytes) {
2706 RAMBlock *block;
56e93d26
JQ
2707 char id[256];
2708 ram_addr_t length;
2709
2710 len = qemu_get_byte(f);
2711 qemu_get_buffer(f, (uint8_t *)id, len);
2712 id[len] = 0;
2713 length = qemu_get_be64(f);
2714
e3dd7493
DDAG
2715 block = qemu_ram_block_by_name(id);
2716 if (block) {
2717 if (length != block->used_length) {
2718 Error *local_err = NULL;
56e93d26 2719
fa53a0e5 2720 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2721 &local_err);
2722 if (local_err) {
2723 error_report_err(local_err);
56e93d26 2724 }
56e93d26 2725 }
ef08fb38
DDAG
2726 /* For postcopy we need to check hugepage sizes match */
2727 if (postcopy_advised &&
2728 block->page_size != qemu_host_page_size) {
2729 uint64_t remote_page_size = qemu_get_be64(f);
2730 if (remote_page_size != block->page_size) {
2731 error_report("Mismatched RAM page size %s "
2732 "(local) %zd != %" PRId64,
2733 id, block->page_size,
2734 remote_page_size);
2735 ret = -EINVAL;
2736 }
2737 }
e3dd7493
DDAG
2738 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2739 block->idstr);
2740 } else {
56e93d26
JQ
2741 error_report("Unknown ramblock \"%s\", cannot "
2742 "accept migration", id);
2743 ret = -EINVAL;
2744 }
2745
2746 total_ram_bytes -= length;
2747 }
2748 break;
a776aa15 2749
56e93d26 2750 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2751 ch = qemu_get_byte(f);
2752 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2753 break;
a776aa15 2754
56e93d26 2755 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2756 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2757 break;
56e93d26 2758
a776aa15 2759 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2760 len = qemu_get_be32(f);
2761 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2762 error_report("Invalid compressed data length: %d", len);
2763 ret = -EINVAL;
2764 break;
2765 }
c1bc6626 2766 decompress_data_with_multi_threads(f, host, len);
56e93d26 2767 break;
a776aa15 2768
56e93d26 2769 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2770 if (load_xbzrle(f, addr, host) < 0) {
2771 error_report("Failed to decompress XBZRLE page at "
2772 RAM_ADDR_FMT, addr);
2773 ret = -EINVAL;
2774 break;
2775 }
2776 break;
2777 case RAM_SAVE_FLAG_EOS:
2778 /* normal exit */
2779 break;
2780 default:
2781 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2782 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2783 } else {
2784 error_report("Unknown combination of migration flags: %#x",
2785 flags);
2786 ret = -EINVAL;
2787 }
2788 }
2789 if (!ret) {
2790 ret = qemu_file_get_error(f);
2791 }
2792 }
2793
5533b2e9 2794 wait_for_decompress_done();
56e93d26 2795 rcu_read_unlock();
55c4446b 2796 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2797 return ret;
2798}
2799
2800static SaveVMHandlers savevm_ram_handlers = {
2801 .save_live_setup = ram_save_setup,
2802 .save_live_iterate = ram_save_iterate,
763c906b 2803 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2804 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2805 .save_live_pending = ram_save_pending,
2806 .load_state = ram_load,
6ad2a215 2807 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2808};
2809
2810void ram_mig_init(void)
2811{
2812 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2813 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2814}