]> git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
ram: Update all functions comments
[mirror_qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26 48static int dirty_rate_high_cnt;
56e93d26
JQ
49
50static uint64_t bitmap_sync_count;
51
52/***********************************************************/
53/* ram save/restore */
54
55#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
56#define RAM_SAVE_FLAG_COMPRESS 0x02
57#define RAM_SAVE_FLAG_MEM_SIZE 0x04
58#define RAM_SAVE_FLAG_PAGE 0x08
59#define RAM_SAVE_FLAG_EOS 0x10
60#define RAM_SAVE_FLAG_CONTINUE 0x20
61#define RAM_SAVE_FLAG_XBZRLE 0x40
62/* 0x80 is reserved in migration.h start with 0x100 next */
63#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
64
adb65dec 65static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
66
67static inline bool is_zero_range(uint8_t *p, uint64_t size)
68{
a1febc49 69 return buffer_is_zero(p, size);
56e93d26
JQ
70}
71
72/* struct contains XBZRLE cache and a static page
73 used by the compression */
74static struct {
75 /* buffer used for XBZRLE encoding */
76 uint8_t *encoded_buf;
77 /* buffer for storing page content */
78 uint8_t *current_buf;
79 /* Cache for XBZRLE, Protected by lock. */
80 PageCache *cache;
81 QemuMutex lock;
82} XBZRLE;
83
84/* buffer used for XBZRLE decoding */
85static uint8_t *xbzrle_decoded_buf;
86
87static void XBZRLE_cache_lock(void)
88{
89 if (migrate_use_xbzrle())
90 qemu_mutex_lock(&XBZRLE.lock);
91}
92
93static void XBZRLE_cache_unlock(void)
94{
95 if (migrate_use_xbzrle())
96 qemu_mutex_unlock(&XBZRLE.lock);
97}
98
3d0684b2
JQ
99/**
100 * xbzrle_cache_resize: resize the xbzrle cache
101 *
102 * This function is called from qmp_migrate_set_cache_size in main
103 * thread, possibly while a migration is in progress. A running
104 * migration may be using the cache and might finish during this call,
105 * hence changes to the cache are protected by XBZRLE.lock().
106 *
107 * Returns the new_size or negative in case of error.
108 *
109 * @new_size: new cache size
56e93d26
JQ
110 */
111int64_t xbzrle_cache_resize(int64_t new_size)
112{
113 PageCache *new_cache;
114 int64_t ret;
115
116 if (new_size < TARGET_PAGE_SIZE) {
117 return -1;
118 }
119
120 XBZRLE_cache_lock();
121
122 if (XBZRLE.cache != NULL) {
123 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
124 goto out_new_size;
125 }
126 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
127 TARGET_PAGE_SIZE);
128 if (!new_cache) {
129 error_report("Error creating cache");
130 ret = -1;
131 goto out;
132 }
133
134 cache_fini(XBZRLE.cache);
135 XBZRLE.cache = new_cache;
136 }
137
138out_new_size:
139 ret = pow2floor(new_size);
140out:
141 XBZRLE_cache_unlock();
142 return ret;
143}
144
145/* accounting for migration statistics */
146typedef struct AccountingInfo {
147 uint64_t dup_pages;
148 uint64_t skipped_pages;
149 uint64_t norm_pages;
150 uint64_t iterations;
151 uint64_t xbzrle_bytes;
152 uint64_t xbzrle_pages;
153 uint64_t xbzrle_cache_miss;
154 double xbzrle_cache_miss_rate;
155 uint64_t xbzrle_overflows;
156} AccountingInfo;
157
158static AccountingInfo acct_info;
159
160static void acct_clear(void)
161{
162 memset(&acct_info, 0, sizeof(acct_info));
163}
164
165uint64_t dup_mig_bytes_transferred(void)
166{
167 return acct_info.dup_pages * TARGET_PAGE_SIZE;
168}
169
170uint64_t dup_mig_pages_transferred(void)
171{
172 return acct_info.dup_pages;
173}
174
175uint64_t skipped_mig_bytes_transferred(void)
176{
177 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
178}
179
180uint64_t skipped_mig_pages_transferred(void)
181{
182 return acct_info.skipped_pages;
183}
184
185uint64_t norm_mig_bytes_transferred(void)
186{
187 return acct_info.norm_pages * TARGET_PAGE_SIZE;
188}
189
190uint64_t norm_mig_pages_transferred(void)
191{
192 return acct_info.norm_pages;
193}
194
195uint64_t xbzrle_mig_bytes_transferred(void)
196{
197 return acct_info.xbzrle_bytes;
198}
199
200uint64_t xbzrle_mig_pages_transferred(void)
201{
202 return acct_info.xbzrle_pages;
203}
204
205uint64_t xbzrle_mig_pages_cache_miss(void)
206{
207 return acct_info.xbzrle_cache_miss;
208}
209
210double xbzrle_mig_cache_miss_rate(void)
211{
212 return acct_info.xbzrle_cache_miss_rate;
213}
214
215uint64_t xbzrle_mig_pages_overflow(void)
216{
217 return acct_info.xbzrle_overflows;
218}
219
220/* This is the last block that we have visited serching for dirty pages
221 */
222static RAMBlock *last_seen_block;
223/* This is the last block from where we have sent data */
224static RAMBlock *last_sent_block;
225static ram_addr_t last_offset;
dd631697 226static QemuMutex migration_bitmap_mutex;
56e93d26
JQ
227static uint64_t migration_dirty_pages;
228static uint32_t last_version;
229static bool ram_bulk_stage;
230
b8fb8cb7
DDAG
231/* used by the search for pages to send */
232struct PageSearchStatus {
233 /* Current block being searched */
234 RAMBlock *block;
235 /* Current offset to search from */
236 ram_addr_t offset;
237 /* Set once we wrap around */
238 bool complete_round;
239};
240typedef struct PageSearchStatus PageSearchStatus;
241
60be6340
DL
242static struct BitmapRcu {
243 struct rcu_head rcu;
f3f491fc 244 /* Main migration bitmap */
60be6340 245 unsigned long *bmap;
f3f491fc
DDAG
246 /* bitmap of pages that haven't been sent even once
247 * only maintained and used in postcopy at the moment
248 * where it's used to send the dirtymap at the start
249 * of the postcopy phase
250 */
251 unsigned long *unsentmap;
60be6340
DL
252} *migration_bitmap_rcu;
253
56e93d26 254struct CompressParam {
56e93d26 255 bool done;
90e56fb4 256 bool quit;
56e93d26
JQ
257 QEMUFile *file;
258 QemuMutex mutex;
259 QemuCond cond;
260 RAMBlock *block;
261 ram_addr_t offset;
262};
263typedef struct CompressParam CompressParam;
264
265struct DecompressParam {
73a8912b 266 bool done;
90e56fb4 267 bool quit;
56e93d26
JQ
268 QemuMutex mutex;
269 QemuCond cond;
270 void *des;
d341d9f3 271 uint8_t *compbuf;
56e93d26
JQ
272 int len;
273};
274typedef struct DecompressParam DecompressParam;
275
276static CompressParam *comp_param;
277static QemuThread *compress_threads;
278/* comp_done_cond is used to wake up the migration thread when
279 * one of the compression threads has finished the compression.
280 * comp_done_lock is used to co-work with comp_done_cond.
281 */
0d9f9a5c
LL
282static QemuMutex comp_done_lock;
283static QemuCond comp_done_cond;
56e93d26
JQ
284/* The empty QEMUFileOps will be used by file in CompressParam */
285static const QEMUFileOps empty_ops = { };
286
287static bool compression_switch;
56e93d26
JQ
288static DecompressParam *decomp_param;
289static QemuThread *decompress_threads;
73a8912b
LL
290static QemuMutex decomp_done_lock;
291static QemuCond decomp_done_cond;
56e93d26 292
a7a9a88f
LL
293static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
294 ram_addr_t offset);
56e93d26
JQ
295
296static void *do_data_compress(void *opaque)
297{
298 CompressParam *param = opaque;
a7a9a88f
LL
299 RAMBlock *block;
300 ram_addr_t offset;
56e93d26 301
a7a9a88f 302 qemu_mutex_lock(&param->mutex);
90e56fb4 303 while (!param->quit) {
a7a9a88f
LL
304 if (param->block) {
305 block = param->block;
306 offset = param->offset;
307 param->block = NULL;
308 qemu_mutex_unlock(&param->mutex);
309
310 do_compress_ram_page(param->file, block, offset);
311
0d9f9a5c 312 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 313 param->done = true;
0d9f9a5c
LL
314 qemu_cond_signal(&comp_done_cond);
315 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
316
317 qemu_mutex_lock(&param->mutex);
318 } else {
56e93d26
JQ
319 qemu_cond_wait(&param->cond, &param->mutex);
320 }
56e93d26 321 }
a7a9a88f 322 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
323
324 return NULL;
325}
326
327static inline void terminate_compression_threads(void)
328{
329 int idx, thread_count;
330
331 thread_count = migrate_compress_threads();
3d0684b2 332
56e93d26
JQ
333 for (idx = 0; idx < thread_count; idx++) {
334 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 335 comp_param[idx].quit = true;
56e93d26
JQ
336 qemu_cond_signal(&comp_param[idx].cond);
337 qemu_mutex_unlock(&comp_param[idx].mutex);
338 }
339}
340
341void migrate_compress_threads_join(void)
342{
343 int i, thread_count;
344
345 if (!migrate_use_compression()) {
346 return;
347 }
348 terminate_compression_threads();
349 thread_count = migrate_compress_threads();
350 for (i = 0; i < thread_count; i++) {
351 qemu_thread_join(compress_threads + i);
352 qemu_fclose(comp_param[i].file);
353 qemu_mutex_destroy(&comp_param[i].mutex);
354 qemu_cond_destroy(&comp_param[i].cond);
355 }
0d9f9a5c
LL
356 qemu_mutex_destroy(&comp_done_lock);
357 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
358 g_free(compress_threads);
359 g_free(comp_param);
56e93d26
JQ
360 compress_threads = NULL;
361 comp_param = NULL;
56e93d26
JQ
362}
363
364void migrate_compress_threads_create(void)
365{
366 int i, thread_count;
367
368 if (!migrate_use_compression()) {
369 return;
370 }
56e93d26
JQ
371 compression_switch = true;
372 thread_count = migrate_compress_threads();
373 compress_threads = g_new0(QemuThread, thread_count);
374 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
375 qemu_cond_init(&comp_done_cond);
376 qemu_mutex_init(&comp_done_lock);
56e93d26 377 for (i = 0; i < thread_count; i++) {
e110aa91
C
378 /* comp_param[i].file is just used as a dummy buffer to save data,
379 * set its ops to empty.
56e93d26
JQ
380 */
381 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
382 comp_param[i].done = true;
90e56fb4 383 comp_param[i].quit = false;
56e93d26
JQ
384 qemu_mutex_init(&comp_param[i].mutex);
385 qemu_cond_init(&comp_param[i].cond);
386 qemu_thread_create(compress_threads + i, "compress",
387 do_data_compress, comp_param + i,
388 QEMU_THREAD_JOINABLE);
389 }
390}
391
392/**
3d0684b2 393 * save_page_header: write page header to wire
56e93d26
JQ
394 *
395 * If this is the 1st block, it also writes the block identification
396 *
3d0684b2 397 * Returns the number of bytes written
56e93d26
JQ
398 *
399 * @f: QEMUFile where to send the data
400 * @block: block that contains the page we want to send
401 * @offset: offset inside the block for the page
402 * in the lower bits, it contains flags
403 */
404static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
405{
9f5f380b 406 size_t size, len;
56e93d26
JQ
407
408 qemu_put_be64(f, offset);
409 size = 8;
410
411 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
412 len = strlen(block->idstr);
413 qemu_put_byte(f, len);
414 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
415 size += 1 + len;
56e93d26
JQ
416 }
417 return size;
418}
419
3d0684b2
JQ
420/**
421 * mig_throttle_guest_down: throotle down the guest
422 *
423 * Reduce amount of guest cpu execution to hopefully slow down memory
424 * writes. If guest dirty memory rate is reduced below the rate at
425 * which we can transfer pages to the destination then we should be
426 * able to complete migration. Some workloads dirty memory way too
427 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
428 */
429static void mig_throttle_guest_down(void)
430{
431 MigrationState *s = migrate_get_current();
2594f56d
DB
432 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
433 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
434
435 /* We have not started throttling yet. Let's start it. */
436 if (!cpu_throttle_active()) {
437 cpu_throttle_set(pct_initial);
438 } else {
439 /* Throttling already on, just increase the rate */
440 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
441 }
442}
443
3d0684b2
JQ
444/**
445 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
446 *
447 * @current_addr: address for the zero page
448 *
449 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
450 * The important thing is that a stale (not-yet-0'd) page be replaced
451 * by the new data.
452 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 453 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26
JQ
454 */
455static void xbzrle_cache_zero_page(ram_addr_t current_addr)
456{
457 if (ram_bulk_stage || !migrate_use_xbzrle()) {
458 return;
459 }
460
461 /* We don't care if this fails to allocate a new cache page
462 * as long as it updated an old one */
463 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
464 bitmap_sync_count);
465}
466
467#define ENCODING_FLAG_XBZRLE 0x1
468
469/**
470 * save_xbzrle_page: compress and send current page
471 *
472 * Returns: 1 means that we wrote the page
473 * 0 means that page is identical to the one already sent
474 * -1 means that xbzrle would be longer than normal
475 *
476 * @f: QEMUFile where to send the data
3d0684b2
JQ
477 * @current_data: pointer to the address of the page contents
478 * @current_addr: addr of the page
56e93d26
JQ
479 * @block: block that contains the page we want to send
480 * @offset: offset inside the block for the page
481 * @last_stage: if we are at the completion stage
482 * @bytes_transferred: increase it with the number of transferred bytes
483 */
484static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
485 ram_addr_t current_addr, RAMBlock *block,
486 ram_addr_t offset, bool last_stage,
487 uint64_t *bytes_transferred)
488{
489 int encoded_len = 0, bytes_xbzrle;
490 uint8_t *prev_cached_page;
491
492 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
493 acct_info.xbzrle_cache_miss++;
494 if (!last_stage) {
495 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
496 bitmap_sync_count) == -1) {
497 return -1;
498 } else {
499 /* update *current_data when the page has been
500 inserted into cache */
501 *current_data = get_cached_data(XBZRLE.cache, current_addr);
502 }
503 }
504 return -1;
505 }
506
507 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
508
509 /* save current buffer into memory */
510 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
511
512 /* XBZRLE encoding (if there is no overflow) */
513 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
514 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
515 TARGET_PAGE_SIZE);
516 if (encoded_len == 0) {
55c4446b 517 trace_save_xbzrle_page_skipping();
56e93d26
JQ
518 return 0;
519 } else if (encoded_len == -1) {
55c4446b 520 trace_save_xbzrle_page_overflow();
56e93d26
JQ
521 acct_info.xbzrle_overflows++;
522 /* update data in the cache */
523 if (!last_stage) {
524 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
525 *current_data = prev_cached_page;
526 }
527 return -1;
528 }
529
530 /* we need to update the data in the cache, in order to get the same data */
531 if (!last_stage) {
532 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
533 }
534
535 /* Send XBZRLE based compressed page */
536 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
537 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
538 qemu_put_be16(f, encoded_len);
539 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
540 bytes_xbzrle += encoded_len + 1 + 2;
541 acct_info.xbzrle_pages++;
542 acct_info.xbzrle_bytes += bytes_xbzrle;
543 *bytes_transferred += bytes_xbzrle;
544
545 return 1;
546}
547
3d0684b2
JQ
548/**
549 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 550 *
3d0684b2
JQ
551 * Called with rcu_read_lock() to protect migration_bitmap
552 *
553 * Returns the byte offset within memory region of the start of a dirty page
554 *
555 * @rb: RAMBlock where to search for dirty pages
556 * @start: starting address (typically so we can continue from previous page)
557 * @ram_addr_abs: pointer into which to store the address of the dirty page
558 * within the global ram_addr space
f3f491fc 559 */
56e93d26 560static inline
a82d593b
DDAG
561ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
562 ram_addr_t start,
563 ram_addr_t *ram_addr_abs)
56e93d26 564{
2f68e399 565 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 566 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
567 uint64_t rb_size = rb->used_length;
568 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 569 unsigned long *bitmap;
56e93d26
JQ
570
571 unsigned long next;
572
60be6340 573 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
56e93d26
JQ
574 if (ram_bulk_stage && nr > base) {
575 next = nr + 1;
576 } else {
2ff64038 577 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
578 }
579
f3f491fc 580 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
581 return (next - base) << TARGET_PAGE_BITS;
582}
583
a82d593b
DDAG
584static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
585{
586 bool ret;
587 int nr = addr >> TARGET_PAGE_BITS;
588 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
589
590 ret = test_and_clear_bit(nr, bitmap);
591
592 if (ret) {
593 migration_dirty_pages--;
594 }
595 return ret;
596}
597
1ffb5dfd 598static int64_t num_dirty_pages_period;
56e93d26
JQ
599static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
600{
2ff64038 601 unsigned long *bitmap;
60be6340 602 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1ffb5dfd
CF
603 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
604 start, length, &num_dirty_pages_period);
56e93d26
JQ
605}
606
56e93d26
JQ
607/* Fix me: there are too many global variables used in migration process. */
608static int64_t start_time;
609static int64_t bytes_xfer_prev;
56e93d26
JQ
610static uint64_t xbzrle_cache_miss_prev;
611static uint64_t iterations_prev;
612
613static void migration_bitmap_sync_init(void)
614{
615 start_time = 0;
616 bytes_xfer_prev = 0;
617 num_dirty_pages_period = 0;
618 xbzrle_cache_miss_prev = 0;
619 iterations_prev = 0;
620}
621
3d0684b2
JQ
622/**
623 * ram_pagesize_summary: calculate all the pagesizes of a VM
624 *
625 * Returns a summary bitmap of the page sizes of all RAMBlocks
626 *
627 * For VMs with just normal pages this is equivalent to the host page
628 * size. If it's got some huge pages then it's the OR of all the
629 * different page sizes.
e8ca1db2
DDAG
630 */
631uint64_t ram_pagesize_summary(void)
632{
633 RAMBlock *block;
634 uint64_t summary = 0;
635
636 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
637 summary |= block->page_size;
638 }
639
640 return summary;
641}
642
56e93d26
JQ
643static void migration_bitmap_sync(void)
644{
645 RAMBlock *block;
56e93d26
JQ
646 MigrationState *s = migrate_get_current();
647 int64_t end_time;
648 int64_t bytes_xfer_now;
649
650 bitmap_sync_count++;
651
652 if (!bytes_xfer_prev) {
653 bytes_xfer_prev = ram_bytes_transferred();
654 }
655
656 if (!start_time) {
657 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
658 }
659
660 trace_migration_bitmap_sync_start();
9c1f8f44 661 memory_global_dirty_log_sync();
56e93d26 662
dd631697 663 qemu_mutex_lock(&migration_bitmap_mutex);
56e93d26
JQ
664 rcu_read_lock();
665 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2f68e399 666 migration_bitmap_sync_range(block->offset, block->used_length);
56e93d26
JQ
667 }
668 rcu_read_unlock();
dd631697 669 qemu_mutex_unlock(&migration_bitmap_mutex);
56e93d26 670
1ffb5dfd
CF
671 trace_migration_bitmap_sync_end(num_dirty_pages_period);
672
56e93d26
JQ
673 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
674
675 /* more than 1 second = 1000 millisecons */
676 if (end_time > start_time + 1000) {
677 if (migrate_auto_converge()) {
678 /* The following detection logic can be refined later. For now:
679 Check to see if the dirtied bytes is 50% more than the approx.
680 amount of bytes that just got transferred since the last time we
070afca2
JH
681 were in this routine. If that happens twice, start or increase
682 throttling */
56e93d26 683 bytes_xfer_now = ram_bytes_transferred();
070afca2 684
56e93d26
JQ
685 if (s->dirty_pages_rate &&
686 (num_dirty_pages_period * TARGET_PAGE_SIZE >
687 (bytes_xfer_now - bytes_xfer_prev)/2) &&
070afca2 688 (dirty_rate_high_cnt++ >= 2)) {
56e93d26 689 trace_migration_throttle();
56e93d26 690 dirty_rate_high_cnt = 0;
070afca2 691 mig_throttle_guest_down();
56e93d26
JQ
692 }
693 bytes_xfer_prev = bytes_xfer_now;
56e93d26 694 }
070afca2 695
56e93d26
JQ
696 if (migrate_use_xbzrle()) {
697 if (iterations_prev != acct_info.iterations) {
698 acct_info.xbzrle_cache_miss_rate =
699 (double)(acct_info.xbzrle_cache_miss -
700 xbzrle_cache_miss_prev) /
701 (acct_info.iterations - iterations_prev);
702 }
703 iterations_prev = acct_info.iterations;
704 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
705 }
706 s->dirty_pages_rate = num_dirty_pages_period * 1000
707 / (end_time - start_time);
708 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
709 start_time = end_time;
710 num_dirty_pages_period = 0;
711 }
712 s->dirty_sync_count = bitmap_sync_count;
4addcd4f
DDAG
713 if (migrate_use_events()) {
714 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
715 }
56e93d26
JQ
716}
717
718/**
3d0684b2 719 * save_zero_page: send the zero page to the stream
56e93d26 720 *
3d0684b2 721 * Returns the number of pages written.
56e93d26
JQ
722 *
723 * @f: QEMUFile where to send the data
724 * @block: block that contains the page we want to send
725 * @offset: offset inside the block for the page
726 * @p: pointer to the page
727 * @bytes_transferred: increase it with the number of transferred bytes
728 */
729static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
730 uint8_t *p, uint64_t *bytes_transferred)
731{
732 int pages = -1;
733
734 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
735 acct_info.dup_pages++;
736 *bytes_transferred += save_page_header(f, block,
737 offset | RAM_SAVE_FLAG_COMPRESS);
738 qemu_put_byte(f, 0);
739 *bytes_transferred += 1;
740 pages = 1;
741 }
742
743 return pages;
744}
745
53f09a10
PB
746static void ram_release_pages(MigrationState *ms, const char *block_name,
747 uint64_t offset, int pages)
748{
749 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
750 return;
751 }
752
753 ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS);
754}
755
56e93d26 756/**
3d0684b2 757 * ram_save_page: send the given page to the stream
56e93d26 758 *
3d0684b2 759 * Returns the number of pages written.
3fd3c4b3
DDAG
760 * < 0 - error
761 * >=0 - Number of pages written - this might legally be 0
762 * if xbzrle noticed the page was the same.
56e93d26 763 *
3d0684b2 764 * @ms: current migration state
56e93d26
JQ
765 * @f: QEMUFile where to send the data
766 * @block: block that contains the page we want to send
767 * @offset: offset inside the block for the page
768 * @last_stage: if we are at the completion stage
769 * @bytes_transferred: increase it with the number of transferred bytes
770 */
9eb14766 771static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
56e93d26
JQ
772 bool last_stage, uint64_t *bytes_transferred)
773{
774 int pages = -1;
775 uint64_t bytes_xmit;
776 ram_addr_t current_addr;
56e93d26
JQ
777 uint8_t *p;
778 int ret;
779 bool send_async = true;
a08f6890
HZ
780 RAMBlock *block = pss->block;
781 ram_addr_t offset = pss->offset;
56e93d26 782
2f68e399 783 p = block->host + offset;
56e93d26
JQ
784
785 /* In doubt sent page as normal */
786 bytes_xmit = 0;
787 ret = ram_control_save_page(f, block->offset,
788 offset, TARGET_PAGE_SIZE, &bytes_xmit);
789 if (bytes_xmit) {
790 *bytes_transferred += bytes_xmit;
791 pages = 1;
792 }
793
794 XBZRLE_cache_lock();
795
796 current_addr = block->offset + offset;
797
798 if (block == last_sent_block) {
799 offset |= RAM_SAVE_FLAG_CONTINUE;
800 }
801 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
802 if (ret != RAM_SAVE_CONTROL_DELAYED) {
803 if (bytes_xmit > 0) {
804 acct_info.norm_pages++;
805 } else if (bytes_xmit == 0) {
806 acct_info.dup_pages++;
807 }
808 }
809 } else {
810 pages = save_zero_page(f, block, offset, p, bytes_transferred);
811 if (pages > 0) {
812 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
813 * page would be stale
814 */
815 xbzrle_cache_zero_page(current_addr);
53f09a10 816 ram_release_pages(ms, block->idstr, pss->offset, pages);
2ebeaec0 817 } else if (!ram_bulk_stage &&
9eb14766 818 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
56e93d26
JQ
819 pages = save_xbzrle_page(f, &p, current_addr, block,
820 offset, last_stage, bytes_transferred);
821 if (!last_stage) {
822 /* Can't send this cached data async, since the cache page
823 * might get updated before it gets to the wire
824 */
825 send_async = false;
826 }
827 }
828 }
829
830 /* XBZRLE overflow or normal page */
831 if (pages == -1) {
832 *bytes_transferred += save_page_header(f, block,
833 offset | RAM_SAVE_FLAG_PAGE);
834 if (send_async) {
53f09a10
PB
835 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
836 migrate_release_ram() &
837 migration_in_postcopy(ms));
56e93d26
JQ
838 } else {
839 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
840 }
841 *bytes_transferred += TARGET_PAGE_SIZE;
842 pages = 1;
843 acct_info.norm_pages++;
844 }
845
846 XBZRLE_cache_unlock();
847
848 return pages;
849}
850
a7a9a88f
LL
851static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
852 ram_addr_t offset)
56e93d26
JQ
853{
854 int bytes_sent, blen;
a7a9a88f 855 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 856
a7a9a88f 857 bytes_sent = save_page_header(f, block, offset |
56e93d26 858 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 859 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 860 migrate_compress_level());
b3be2896
LL
861 if (blen < 0) {
862 bytes_sent = 0;
863 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
864 error_report("compressed data failed!");
865 } else {
866 bytes_sent += blen;
53f09a10
PB
867 ram_release_pages(migrate_get_current(), block->idstr,
868 offset & TARGET_PAGE_MASK, 1);
b3be2896 869 }
56e93d26
JQ
870
871 return bytes_sent;
872}
873
56e93d26
JQ
874static uint64_t bytes_transferred;
875
876static void flush_compressed_data(QEMUFile *f)
877{
878 int idx, len, thread_count;
879
880 if (!migrate_use_compression()) {
881 return;
882 }
883 thread_count = migrate_compress_threads();
a7a9a88f 884
0d9f9a5c 885 qemu_mutex_lock(&comp_done_lock);
56e93d26 886 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 887 while (!comp_param[idx].done) {
0d9f9a5c 888 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 889 }
a7a9a88f 890 }
0d9f9a5c 891 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
892
893 for (idx = 0; idx < thread_count; idx++) {
894 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 895 if (!comp_param[idx].quit) {
56e93d26
JQ
896 len = qemu_put_qemu_file(f, comp_param[idx].file);
897 bytes_transferred += len;
898 }
a7a9a88f 899 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
900 }
901}
902
903static inline void set_compress_params(CompressParam *param, RAMBlock *block,
904 ram_addr_t offset)
905{
906 param->block = block;
907 param->offset = offset;
908}
909
910static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
911 ram_addr_t offset,
912 uint64_t *bytes_transferred)
913{
914 int idx, thread_count, bytes_xmit = -1, pages = -1;
915
916 thread_count = migrate_compress_threads();
0d9f9a5c 917 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
918 while (true) {
919 for (idx = 0; idx < thread_count; idx++) {
920 if (comp_param[idx].done) {
a7a9a88f 921 comp_param[idx].done = false;
56e93d26 922 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 923 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 924 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
925 qemu_cond_signal(&comp_param[idx].cond);
926 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
927 pages = 1;
928 acct_info.norm_pages++;
929 *bytes_transferred += bytes_xmit;
930 break;
931 }
932 }
933 if (pages > 0) {
934 break;
935 } else {
0d9f9a5c 936 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
937 }
938 }
0d9f9a5c 939 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
940
941 return pages;
942}
943
944/**
945 * ram_save_compressed_page: compress the given page and send it to the stream
946 *
3d0684b2 947 * Returns the number of pages written.
56e93d26 948 *
3d0684b2 949 * @ms: current migration state
56e93d26
JQ
950 * @f: QEMUFile where to send the data
951 * @block: block that contains the page we want to send
952 * @offset: offset inside the block for the page
953 * @last_stage: if we are at the completion stage
954 * @bytes_transferred: increase it with the number of transferred bytes
955 */
9eb14766
PB
956static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
957 PageSearchStatus *pss, bool last_stage,
56e93d26
JQ
958 uint64_t *bytes_transferred)
959{
960 int pages = -1;
fc50438e 961 uint64_t bytes_xmit = 0;
56e93d26 962 uint8_t *p;
fc50438e 963 int ret, blen;
a08f6890
HZ
964 RAMBlock *block = pss->block;
965 ram_addr_t offset = pss->offset;
56e93d26 966
2f68e399 967 p = block->host + offset;
56e93d26 968
56e93d26
JQ
969 ret = ram_control_save_page(f, block->offset,
970 offset, TARGET_PAGE_SIZE, &bytes_xmit);
971 if (bytes_xmit) {
972 *bytes_transferred += bytes_xmit;
973 pages = 1;
974 }
56e93d26
JQ
975 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
976 if (ret != RAM_SAVE_CONTROL_DELAYED) {
977 if (bytes_xmit > 0) {
978 acct_info.norm_pages++;
979 } else if (bytes_xmit == 0) {
980 acct_info.dup_pages++;
981 }
982 }
983 } else {
984 /* When starting the process of a new block, the first page of
985 * the block should be sent out before other pages in the same
986 * block, and all the pages in last block should have been sent
987 * out, keeping this order is important, because the 'cont' flag
988 * is used to avoid resending the block name.
989 */
990 if (block != last_sent_block) {
991 flush_compressed_data(f);
992 pages = save_zero_page(f, block, offset, p, bytes_transferred);
993 if (pages == -1) {
fc50438e
LL
994 /* Make sure the first page is sent out before other pages */
995 bytes_xmit = save_page_header(f, block, offset |
996 RAM_SAVE_FLAG_COMPRESS_PAGE);
997 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
998 migrate_compress_level());
999 if (blen > 0) {
1000 *bytes_transferred += bytes_xmit + blen;
b3be2896 1001 acct_info.norm_pages++;
b3be2896 1002 pages = 1;
fc50438e
LL
1003 } else {
1004 qemu_file_set_error(f, blen);
1005 error_report("compressed data failed!");
b3be2896 1006 }
56e93d26 1007 }
53f09a10
PB
1008 if (pages > 0) {
1009 ram_release_pages(ms, block->idstr, pss->offset, pages);
1010 }
56e93d26 1011 } else {
fc50438e 1012 offset |= RAM_SAVE_FLAG_CONTINUE;
56e93d26
JQ
1013 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1014 if (pages == -1) {
1015 pages = compress_page_with_multi_thread(f, block, offset,
1016 bytes_transferred);
53f09a10
PB
1017 } else {
1018 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1019 }
1020 }
1021 }
1022
1023 return pages;
1024}
1025
3d0684b2
JQ
1026/**
1027 * find_dirty_block: find the next dirty page and update any state
1028 * associated with the search process.
b9e60928 1029 *
3d0684b2 1030 * Returns if a page is found
b9e60928 1031 *
3d0684b2
JQ
1032 * @f: QEMUFile where to send the data
1033 * @pss: data about the state of the current dirty page scan
1034 * @again: set to false if the search has scanned the whole of RAM
1035 * @ram_addr_abs: pointer into which to store the address of the dirty page
1036 * within the global ram_addr space
b9e60928
DDAG
1037 */
1038static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1039 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1040{
a82d593b
DDAG
1041 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
1042 ram_addr_abs);
b9e60928
DDAG
1043 if (pss->complete_round && pss->block == last_seen_block &&
1044 pss->offset >= last_offset) {
1045 /*
1046 * We've been once around the RAM and haven't found anything.
1047 * Give up.
1048 */
1049 *again = false;
1050 return false;
1051 }
1052 if (pss->offset >= pss->block->used_length) {
1053 /* Didn't find anything in this RAM Block */
1054 pss->offset = 0;
1055 pss->block = QLIST_NEXT_RCU(pss->block, next);
1056 if (!pss->block) {
1057 /* Hit the end of the list */
1058 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1059 /* Flag that we've looped */
1060 pss->complete_round = true;
1061 ram_bulk_stage = false;
1062 if (migrate_use_xbzrle()) {
1063 /* If xbzrle is on, stop using the data compression at this
1064 * point. In theory, xbzrle can do better than compression.
1065 */
1066 flush_compressed_data(f);
1067 compression_switch = false;
1068 }
1069 }
1070 /* Didn't find anything this time, but try again on the new block */
1071 *again = true;
1072 return false;
1073 } else {
1074 /* Can go around again, but... */
1075 *again = true;
1076 /* We've found something so probably don't need to */
1077 return true;
1078 }
1079}
1080
3d0684b2
JQ
1081/**
1082 * unqueue_page: gets a page of the queue
1083 *
a82d593b 1084 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1085 *
3d0684b2
JQ
1086 * Returns the block of the page (or NULL if none available)
1087 *
1088 * @ms: current migration state
1089 * @offset: used to return the offset within the RAMBlock
1090 * @ram_addr_abs: pointer into which to store the address of the dirty page
1091 * within the global ram_addr space
a82d593b
DDAG
1092 */
1093static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1094 ram_addr_t *ram_addr_abs)
1095{
1096 RAMBlock *block = NULL;
1097
1098 qemu_mutex_lock(&ms->src_page_req_mutex);
1099 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1100 struct MigrationSrcPageRequest *entry =
1101 QSIMPLEQ_FIRST(&ms->src_page_requests);
1102 block = entry->rb;
1103 *offset = entry->offset;
1104 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1105 TARGET_PAGE_MASK;
1106
1107 if (entry->len > TARGET_PAGE_SIZE) {
1108 entry->len -= TARGET_PAGE_SIZE;
1109 entry->offset += TARGET_PAGE_SIZE;
1110 } else {
1111 memory_region_unref(block->mr);
1112 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1113 g_free(entry);
1114 }
1115 }
1116 qemu_mutex_unlock(&ms->src_page_req_mutex);
1117
1118 return block;
1119}
1120
3d0684b2
JQ
1121/**
1122 * get_queued_page: unqueue a page from the postocpy requests
1123 *
1124 * Skips pages that are already sent (!dirty)
a82d593b 1125 *
3d0684b2 1126 * Returns if a queued page is found
a82d593b 1127 *
3d0684b2
JQ
1128 * @ms: current migration state
1129 * @pss: data about the state of the current dirty page scan
1130 * @ram_addr_abs: pointer into which to store the address of the dirty page
1131 * within the global ram_addr space
a82d593b
DDAG
1132 */
1133static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1134 ram_addr_t *ram_addr_abs)
1135{
1136 RAMBlock *block;
1137 ram_addr_t offset;
1138 bool dirty;
1139
1140 do {
1141 block = unqueue_page(ms, &offset, ram_addr_abs);
1142 /*
1143 * We're sending this page, and since it's postcopy nothing else
1144 * will dirty it, and we must make sure it doesn't get sent again
1145 * even if this queue request was received after the background
1146 * search already sent it.
1147 */
1148 if (block) {
1149 unsigned long *bitmap;
1150 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1151 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1152 if (!dirty) {
1153 trace_get_queued_page_not_dirty(
1154 block->idstr, (uint64_t)offset,
1155 (uint64_t)*ram_addr_abs,
1156 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1157 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1158 } else {
1159 trace_get_queued_page(block->idstr,
1160 (uint64_t)offset,
1161 (uint64_t)*ram_addr_abs);
1162 }
1163 }
1164
1165 } while (block && !dirty);
1166
1167 if (block) {
1168 /*
1169 * As soon as we start servicing pages out of order, then we have
1170 * to kill the bulk stage, since the bulk stage assumes
1171 * in (migration_bitmap_find_and_reset_dirty) that every page is
1172 * dirty, that's no longer true.
1173 */
1174 ram_bulk_stage = false;
1175
1176 /*
1177 * We want the background search to continue from the queued page
1178 * since the guest is likely to want other pages near to the page
1179 * it just requested.
1180 */
1181 pss->block = block;
1182 pss->offset = offset;
1183 }
1184
1185 return !!block;
1186}
1187
6c595cde 1188/**
3d0684b2 1189 * flush_page_queue: flush any remaining pages in the ram request queue
6c595cde 1190 *
3d0684b2
JQ
1191 * It should be empty at the end anyway, but in error cases there may
1192 * be some left. in case that there is any page left, we drop it.
1193 *
1194 * @ms: current migration state
6c595cde
DDAG
1195 */
1196void flush_page_queue(MigrationState *ms)
1197{
1198 struct MigrationSrcPageRequest *mspr, *next_mspr;
1199 /* This queue generally should be empty - but in the case of a failed
1200 * migration might have some droppings in.
1201 */
1202 rcu_read_lock();
1203 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1204 memory_region_unref(mspr->rb->mr);
1205 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1206 g_free(mspr);
1207 }
1208 rcu_read_unlock();
1209}
1210
1211/**
3d0684b2
JQ
1212 * ram_save_queue_pages: queue the page for transmission
1213 *
1214 * A request from postcopy destination for example.
1215 *
1216 * Returns zero on success or negative on error
1217 *
1218 * @ms: current migration state
1219 * @rbname: Name of the RAMBLock of the request. NULL means the
1220 * same that last one.
1221 * @start: starting address from the start of the RAMBlock
1222 * @len: length (in bytes) to send
6c595cde
DDAG
1223 */
1224int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1225 ram_addr_t start, ram_addr_t len)
1226{
1227 RAMBlock *ramblock;
1228
d3bf5418 1229 ms->postcopy_requests++;
6c595cde
DDAG
1230 rcu_read_lock();
1231 if (!rbname) {
1232 /* Reuse last RAMBlock */
1233 ramblock = ms->last_req_rb;
1234
1235 if (!ramblock) {
1236 /*
1237 * Shouldn't happen, we can't reuse the last RAMBlock if
1238 * it's the 1st request.
1239 */
1240 error_report("ram_save_queue_pages no previous block");
1241 goto err;
1242 }
1243 } else {
1244 ramblock = qemu_ram_block_by_name(rbname);
1245
1246 if (!ramblock) {
1247 /* We shouldn't be asked for a non-existent RAMBlock */
1248 error_report("ram_save_queue_pages no block '%s'", rbname);
1249 goto err;
1250 }
1251 ms->last_req_rb = ramblock;
1252 }
1253 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1254 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1255 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1256 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1257 __func__, start, len, ramblock->used_length);
1258 goto err;
1259 }
1260
1261 struct MigrationSrcPageRequest *new_entry =
1262 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1263 new_entry->rb = ramblock;
1264 new_entry->offset = start;
1265 new_entry->len = len;
1266
1267 memory_region_ref(ramblock->mr);
1268 qemu_mutex_lock(&ms->src_page_req_mutex);
1269 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1270 qemu_mutex_unlock(&ms->src_page_req_mutex);
1271 rcu_read_unlock();
1272
1273 return 0;
1274
1275err:
1276 rcu_read_unlock();
1277 return -1;
1278}
1279
a82d593b 1280/**
3d0684b2 1281 * ram_save_target_page: save one target page
a82d593b 1282 *
3d0684b2 1283 * Returns the number of pages written
a82d593b 1284 *
3d0684b2 1285 * @ms: current migration state
a82d593b 1286 * @f: QEMUFile where to send the data
3d0684b2 1287 * @pss: data about the page we want to send
a82d593b
DDAG
1288 * @last_stage: if we are at the completion stage
1289 * @bytes_transferred: increase it with the number of transferred bytes
3d0684b2 1290 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b
DDAG
1291 */
1292static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
a08f6890 1293 PageSearchStatus *pss,
a82d593b
DDAG
1294 bool last_stage,
1295 uint64_t *bytes_transferred,
1296 ram_addr_t dirty_ram_abs)
1297{
1298 int res = 0;
1299
1300 /* Check the pages is dirty and if it is send it */
1301 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1302 unsigned long *unsentmap;
1303 if (compression_switch && migrate_use_compression()) {
9eb14766 1304 res = ram_save_compressed_page(ms, f, pss,
a82d593b
DDAG
1305 last_stage,
1306 bytes_transferred);
1307 } else {
9eb14766 1308 res = ram_save_page(ms, f, pss, last_stage,
a82d593b
DDAG
1309 bytes_transferred);
1310 }
1311
1312 if (res < 0) {
1313 return res;
1314 }
1315 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1316 if (unsentmap) {
1317 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1318 }
3fd3c4b3
DDAG
1319 /* Only update last_sent_block if a block was actually sent; xbzrle
1320 * might have decided the page was identical so didn't bother writing
1321 * to the stream.
1322 */
1323 if (res > 0) {
a08f6890 1324 last_sent_block = pss->block;
3fd3c4b3 1325 }
a82d593b
DDAG
1326 }
1327
1328 return res;
1329}
1330
1331/**
3d0684b2 1332 * ram_save_host_page: save a whole host page
a82d593b 1333 *
3d0684b2
JQ
1334 * Starting at *offset send pages up to the end of the current host
1335 * page. It's valid for the initial offset to point into the middle of
1336 * a host page in which case the remainder of the hostpage is sent.
1337 * Only dirty target pages are sent. Note that the host page size may
1338 * be a huge page for this block.
a82d593b 1339 *
3d0684b2
JQ
1340 * Returns the number of pages written or negative on error
1341 *
1342 * @ms: current migration state
a82d593b 1343 * @f: QEMUFile where to send the data
3d0684b2 1344 * @pss: data about the page we want to send
a82d593b
DDAG
1345 * @last_stage: if we are at the completion stage
1346 * @bytes_transferred: increase it with the number of transferred bytes
1347 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1348 */
a08f6890
HZ
1349static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1350 PageSearchStatus *pss,
1351 bool last_stage,
a82d593b
DDAG
1352 uint64_t *bytes_transferred,
1353 ram_addr_t dirty_ram_abs)
1354{
1355 int tmppages, pages = 0;
4c011c37
DDAG
1356 size_t pagesize = qemu_ram_pagesize(pss->block);
1357
a82d593b 1358 do {
a08f6890 1359 tmppages = ram_save_target_page(ms, f, pss, last_stage,
a82d593b
DDAG
1360 bytes_transferred, dirty_ram_abs);
1361 if (tmppages < 0) {
1362 return tmppages;
1363 }
1364
1365 pages += tmppages;
a08f6890 1366 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1367 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1368 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1369
1370 /* The offset we leave with is the last one we looked at */
a08f6890 1371 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1372 return pages;
1373}
6c595cde 1374
56e93d26 1375/**
3d0684b2 1376 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1377 *
1378 * Called within an RCU critical section.
1379 *
3d0684b2 1380 * Returns the number of pages written where zero means no dirty pages
56e93d26
JQ
1381 *
1382 * @f: QEMUFile where to send the data
1383 * @last_stage: if we are at the completion stage
1384 * @bytes_transferred: increase it with the number of transferred bytes
a82d593b
DDAG
1385 *
1386 * On systems where host-page-size > target-page-size it will send all the
1387 * pages in a host page that are dirty.
56e93d26
JQ
1388 */
1389
1390static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1391 uint64_t *bytes_transferred)
1392{
b8fb8cb7 1393 PageSearchStatus pss;
a82d593b 1394 MigrationState *ms = migrate_get_current();
56e93d26 1395 int pages = 0;
b9e60928 1396 bool again, found;
f3f491fc
DDAG
1397 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1398 ram_addr_t space */
56e93d26 1399
0827b9e9
AA
1400 /* No dirty page as there is zero RAM */
1401 if (!ram_bytes_total()) {
1402 return pages;
1403 }
1404
b8fb8cb7
DDAG
1405 pss.block = last_seen_block;
1406 pss.offset = last_offset;
1407 pss.complete_round = false;
1408
1409 if (!pss.block) {
1410 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1411 }
56e93d26 1412
b9e60928 1413 do {
a82d593b
DDAG
1414 again = true;
1415 found = get_queued_page(ms, &pss, &dirty_ram_abs);
b9e60928 1416
a82d593b
DDAG
1417 if (!found) {
1418 /* priority queue empty, so just search for something dirty */
1419 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1420 }
f3f491fc 1421
a82d593b 1422 if (found) {
a08f6890 1423 pages = ram_save_host_page(ms, f, &pss,
a82d593b
DDAG
1424 last_stage, bytes_transferred,
1425 dirty_ram_abs);
56e93d26 1426 }
b9e60928 1427 } while (!pages && again);
56e93d26 1428
b8fb8cb7
DDAG
1429 last_seen_block = pss.block;
1430 last_offset = pss.offset;
56e93d26
JQ
1431
1432 return pages;
1433}
1434
1435void acct_update_position(QEMUFile *f, size_t size, bool zero)
1436{
1437 uint64_t pages = size / TARGET_PAGE_SIZE;
1438 if (zero) {
1439 acct_info.dup_pages += pages;
1440 } else {
1441 acct_info.norm_pages += pages;
1442 bytes_transferred += size;
1443 qemu_update_position(f, size);
1444 }
1445}
1446
1447static ram_addr_t ram_save_remaining(void)
1448{
1449 return migration_dirty_pages;
1450}
1451
1452uint64_t ram_bytes_remaining(void)
1453{
1454 return ram_save_remaining() * TARGET_PAGE_SIZE;
1455}
1456
1457uint64_t ram_bytes_transferred(void)
1458{
1459 return bytes_transferred;
1460}
1461
1462uint64_t ram_bytes_total(void)
1463{
1464 RAMBlock *block;
1465 uint64_t total = 0;
1466
1467 rcu_read_lock();
1468 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1469 total += block->used_length;
1470 rcu_read_unlock();
1471 return total;
1472}
1473
1474void free_xbzrle_decoded_buf(void)
1475{
1476 g_free(xbzrle_decoded_buf);
1477 xbzrle_decoded_buf = NULL;
1478}
1479
60be6340
DL
1480static void migration_bitmap_free(struct BitmapRcu *bmap)
1481{
1482 g_free(bmap->bmap);
f3f491fc 1483 g_free(bmap->unsentmap);
60be6340
DL
1484 g_free(bmap);
1485}
1486
6ad2a215 1487static void ram_migration_cleanup(void *opaque)
56e93d26 1488{
2ff64038
LZ
1489 /* caller have hold iothread lock or is in a bh, so there is
1490 * no writing race against this migration_bitmap
1491 */
60be6340
DL
1492 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1493 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1494 if (bitmap) {
56e93d26 1495 memory_global_dirty_log_stop();
60be6340 1496 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1497 }
1498
1499 XBZRLE_cache_lock();
1500 if (XBZRLE.cache) {
1501 cache_fini(XBZRLE.cache);
1502 g_free(XBZRLE.encoded_buf);
1503 g_free(XBZRLE.current_buf);
adb65dec 1504 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1505 XBZRLE.cache = NULL;
1506 XBZRLE.encoded_buf = NULL;
1507 XBZRLE.current_buf = NULL;
1508 }
1509 XBZRLE_cache_unlock();
1510}
1511
56e93d26
JQ
1512static void reset_ram_globals(void)
1513{
1514 last_seen_block = NULL;
1515 last_sent_block = NULL;
1516 last_offset = 0;
1517 last_version = ram_list.version;
1518 ram_bulk_stage = true;
1519}
1520
1521#define MAX_WAIT 50 /* ms, half buffered_file limit */
1522
dd631697
LZ
1523void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1524{
1525 /* called in qemu main thread, so there is
1526 * no writing race against this migration_bitmap
1527 */
60be6340
DL
1528 if (migration_bitmap_rcu) {
1529 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1530 bitmap = g_new(struct BitmapRcu, 1);
1531 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1532
1533 /* prevent migration_bitmap content from being set bit
1534 * by migration_bitmap_sync_range() at the same time.
1535 * it is safe to migration if migration_bitmap is cleared bit
1536 * at the same time.
1537 */
1538 qemu_mutex_lock(&migration_bitmap_mutex);
60be6340
DL
1539 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1540 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1541
1542 /* We don't have a way to safely extend the sentmap
1543 * with RCU; so mark it as missing, entry to postcopy
1544 * will fail.
1545 */
1546 bitmap->unsentmap = NULL;
1547
60be6340 1548 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
dd631697
LZ
1549 qemu_mutex_unlock(&migration_bitmap_mutex);
1550 migration_dirty_pages += new - old;
60be6340 1551 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1552 }
1553}
56e93d26 1554
4f2e4252
DDAG
1555/*
1556 * 'expected' is the value you expect the bitmap mostly to be full
1557 * of; it won't bother printing lines that are all this value.
1558 * If 'todump' is null the migration bitmap is dumped.
1559 */
1560void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1561{
1562 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1563
1564 int64_t cur;
1565 int64_t linelen = 128;
1566 char linebuf[129];
1567
1568 if (!todump) {
1569 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1570 }
1571
1572 for (cur = 0; cur < ram_pages; cur += linelen) {
1573 int64_t curb;
1574 bool found = false;
1575 /*
1576 * Last line; catch the case where the line length
1577 * is longer than remaining ram
1578 */
1579 if (cur + linelen > ram_pages) {
1580 linelen = ram_pages - cur;
1581 }
1582 for (curb = 0; curb < linelen; curb++) {
1583 bool thisbit = test_bit(cur + curb, todump);
1584 linebuf[curb] = thisbit ? '1' : '.';
1585 found = found || (thisbit != expected);
1586 }
1587 if (found) {
1588 linebuf[curb] = '\0';
1589 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1590 }
1591 }
1592}
1593
e0b266f0
DDAG
1594/* **** functions for postcopy ***** */
1595
ced1c616
PB
1596void ram_postcopy_migrated_memory_release(MigrationState *ms)
1597{
1598 struct RAMBlock *block;
1599 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1600
1601 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1602 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1603 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1604 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1605
1606 while (run_start < range) {
1607 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1608 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1609 (run_end - run_start) << TARGET_PAGE_BITS);
1610 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1611 }
1612 }
1613}
1614
3d0684b2
JQ
1615/**
1616 * postcopy_send_discard_bm_ram: discard a RAMBlock
1617 *
1618 * Returns zero on success
1619 *
e0b266f0
DDAG
1620 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1621 * Note: At this point the 'unsentmap' is the processed bitmap combined
1622 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1623 *
1624 * @ms: current migration state
1625 * @pds: state for postcopy
1626 * @start: RAMBlock starting page
1627 * @length: RAMBlock size
e0b266f0
DDAG
1628 */
1629static int postcopy_send_discard_bm_ram(MigrationState *ms,
1630 PostcopyDiscardState *pds,
1631 unsigned long start,
1632 unsigned long length)
1633{
1634 unsigned long end = start + length; /* one after the end */
1635 unsigned long current;
1636 unsigned long *unsentmap;
1637
1638 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1639 for (current = start; current < end; ) {
1640 unsigned long one = find_next_bit(unsentmap, end, current);
1641
1642 if (one <= end) {
1643 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1644 unsigned long discard_length;
1645
1646 if (zero >= end) {
1647 discard_length = end - one;
1648 } else {
1649 discard_length = zero - one;
1650 }
d688c62d
DDAG
1651 if (discard_length) {
1652 postcopy_discard_send_range(ms, pds, one, discard_length);
1653 }
e0b266f0
DDAG
1654 current = one + discard_length;
1655 } else {
1656 current = one;
1657 }
1658 }
1659
1660 return 0;
1661}
1662
3d0684b2
JQ
1663/**
1664 * postcopy_each_ram_send_discard: discard all RAMBlocks
1665 *
1666 * Returns 0 for success or negative for error
1667 *
e0b266f0
DDAG
1668 * Utility for the outgoing postcopy code.
1669 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1670 * passing it bitmap indexes and name.
e0b266f0
DDAG
1671 * (qemu_ram_foreach_block ends up passing unscaled lengths
1672 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1673 *
1674 * @ms: current migration state
e0b266f0
DDAG
1675 */
1676static int postcopy_each_ram_send_discard(MigrationState *ms)
1677{
1678 struct RAMBlock *block;
1679 int ret;
1680
1681 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1682 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1683 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1684 first,
1685 block->idstr);
1686
1687 /*
1688 * Postcopy sends chunks of bitmap over the wire, but it
1689 * just needs indexes at this point, avoids it having
1690 * target page specific code.
1691 */
1692 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1693 block->used_length >> TARGET_PAGE_BITS);
1694 postcopy_discard_send_finish(ms, pds);
1695 if (ret) {
1696 return ret;
1697 }
1698 }
1699
1700 return 0;
1701}
1702
3d0684b2
JQ
1703/**
1704 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1705 *
1706 * Helper for postcopy_chunk_hostpages; it's called twice to
1707 * canonicalize the two bitmaps, that are similar, but one is
1708 * inverted.
99e314eb 1709 *
3d0684b2
JQ
1710 * Postcopy requires that all target pages in a hostpage are dirty or
1711 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1712 *
3d0684b2
JQ
1713 * @ms: current migration state
1714 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1715 * otherwise we need to canonicalize partially dirty host pages
1716 * @block: block that contains the page we want to canonicalize
1717 * @pds: state for postcopy
99e314eb
DDAG
1718 */
1719static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1720 RAMBlock *block,
1721 PostcopyDiscardState *pds)
1722{
1723 unsigned long *bitmap;
1724 unsigned long *unsentmap;
29c59172 1725 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1726 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1727 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1728 unsigned long last = first + (len - 1);
1729 unsigned long run_start;
1730
29c59172
DDAG
1731 if (block->page_size == TARGET_PAGE_SIZE) {
1732 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1733 return;
1734 }
1735
99e314eb
DDAG
1736 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1737 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1738
1739 if (unsent_pass) {
1740 /* Find a sent page */
1741 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1742 } else {
1743 /* Find a dirty page */
1744 run_start = find_next_bit(bitmap, last + 1, first);
1745 }
1746
1747 while (run_start <= last) {
1748 bool do_fixup = false;
1749 unsigned long fixup_start_addr;
1750 unsigned long host_offset;
1751
1752 /*
1753 * If the start of this run of pages is in the middle of a host
1754 * page, then we need to fixup this host page.
1755 */
1756 host_offset = run_start % host_ratio;
1757 if (host_offset) {
1758 do_fixup = true;
1759 run_start -= host_offset;
1760 fixup_start_addr = run_start;
1761 /* For the next pass */
1762 run_start = run_start + host_ratio;
1763 } else {
1764 /* Find the end of this run */
1765 unsigned long run_end;
1766 if (unsent_pass) {
1767 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1768 } else {
1769 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1770 }
1771 /*
1772 * If the end isn't at the start of a host page, then the
1773 * run doesn't finish at the end of a host page
1774 * and we need to discard.
1775 */
1776 host_offset = run_end % host_ratio;
1777 if (host_offset) {
1778 do_fixup = true;
1779 fixup_start_addr = run_end - host_offset;
1780 /*
1781 * This host page has gone, the next loop iteration starts
1782 * from after the fixup
1783 */
1784 run_start = fixup_start_addr + host_ratio;
1785 } else {
1786 /*
1787 * No discards on this iteration, next loop starts from
1788 * next sent/dirty page
1789 */
1790 run_start = run_end + 1;
1791 }
1792 }
1793
1794 if (do_fixup) {
1795 unsigned long page;
1796
1797 /* Tell the destination to discard this page */
1798 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1799 /* For the unsent_pass we:
1800 * discard partially sent pages
1801 * For the !unsent_pass (dirty) we:
1802 * discard partially dirty pages that were sent
1803 * (any partially sent pages were already discarded
1804 * by the previous unsent_pass)
1805 */
1806 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1807 host_ratio);
1808 }
1809
1810 /* Clean up the bitmap */
1811 for (page = fixup_start_addr;
1812 page < fixup_start_addr + host_ratio; page++) {
1813 /* All pages in this host page are now not sent */
1814 set_bit(page, unsentmap);
1815
1816 /*
1817 * Remark them as dirty, updating the count for any pages
1818 * that weren't previously dirty.
1819 */
1820 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1821 }
1822 }
1823
1824 if (unsent_pass) {
1825 /* Find the next sent page for the next iteration */
1826 run_start = find_next_zero_bit(unsentmap, last + 1,
1827 run_start);
1828 } else {
1829 /* Find the next dirty page for the next iteration */
1830 run_start = find_next_bit(bitmap, last + 1, run_start);
1831 }
1832 }
1833}
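/*
 * A worked example of the fixup above, assuming a 2MB huge-page RAMBlock
 * and 4KB target pages (host_ratio == 512): if the unsent pass finds a
 * run of already-sent pages starting at target page 700, host_offset is
 * 700 % 512 == 188, so that host page is only partially sent; the fixup
 * rewinds to page 512, tells the destination to discard all 512 target
 * pages of that host page, sets them all in the unsentmap again, and
 * re-marks as dirty any that were clean (bumping migration_dirty_pages).
 */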
1834
3d0684b2
JQ
1835/**
 1836 * postcopy_chunk_hostpages: discard any partially sent host page
1837 *
99e314eb
DDAG
1838 * Utility for the outgoing postcopy code.
1839 *
1840 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1841 * dirty host-page size chunks as all dirty. In this case the host-page
 1842 * is the host page for the particular RAMBlock, i.e. it might be a huge page.
99e314eb 1843 *
3d0684b2
JQ
1844 * Returns zero on success
1845 *
1846 * @ms: current migration state
99e314eb
DDAG
1847 */
1848static int postcopy_chunk_hostpages(MigrationState *ms)
1849{
1850 struct RAMBlock *block;
1851
99e314eb
DDAG
1852 /* Easiest way to make sure we don't resume in the middle of a host-page */
1853 last_seen_block = NULL;
1854 last_sent_block = NULL;
1855 last_offset = 0;
1856
1857 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1858 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1859
1860 PostcopyDiscardState *pds =
1861 postcopy_discard_send_init(ms, first, block->idstr);
1862
1863 /* First pass: Discard all partially sent host pages */
1864 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1865 /*
1866 * Second pass: Ensure that all partially dirty host pages are made
1867 * fully dirty.
1868 */
1869 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1870
1871 postcopy_discard_send_finish(ms, pds);
1872 } /* ram_list loop */
1873
1874 return 0;
1875}
1876
3d0684b2
JQ
1877/**
1878 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1879 *
1880 * Returns zero on success
1881 *
e0b266f0
DDAG
 1882 * Transmit the set of pages to be discarded after precopy to the target;
1883 * these are pages that:
1884 * a) Have been previously transmitted but are now dirty again
1885 * b) Pages that have never been transmitted, this ensures that
1886 * any pages on the destination that have been mapped by background
1887 * tasks get discarded (transparent huge pages is the specific concern)
1888 * Hopefully this is pretty sparse
3d0684b2
JQ
1889 *
1890 * @ms: current migration state
e0b266f0
DDAG
1891 */
1892int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1893{
1894 int ret;
1895 unsigned long *bitmap, *unsentmap;
1896
1897 rcu_read_lock();
1898
1899 /* This should be our last sync, the src is now paused */
1900 migration_bitmap_sync();
1901
1902 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1903 if (!unsentmap) {
1904 /* We don't have a safe way to resize the sentmap, so
1905 * if the bitmap was resized it will be NULL at this
1906 * point.
1907 */
1908 error_report("migration ram resized during precopy phase");
1909 rcu_read_unlock();
1910 return -EINVAL;
1911 }
1912
29c59172 1913 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1914 ret = postcopy_chunk_hostpages(ms);
1915 if (ret) {
1916 rcu_read_unlock();
1917 return ret;
1918 }
1919
e0b266f0
DDAG
1920 /*
1921 * Update the unsentmap to be unsentmap = unsentmap | dirty
1922 */
1923 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1924 bitmap_or(unsentmap, unsentmap, bitmap,
1925 last_ram_offset() >> TARGET_PAGE_BITS);
1926
1927
1928 trace_ram_postcopy_send_discard_bitmap();
1929#ifdef DEBUG_POSTCOPY
1930 ram_debug_dump_bitmap(unsentmap, true);
1931#endif
1932
1933 ret = postcopy_each_ram_send_discard(ms);
1934 rcu_read_unlock();
1935
1936 return ret;
1937}
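/*
 * After the bitmap_or above, a set bit in the unsentmap means "this page
 * is either still unsent or has been re-dirtied since it was sent"; in
 * both cases the destination's copy is stale, and that is exactly the
 * set of pages postcopy_each_ram_send_discard() asks it to drop.
 */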
1938
3d0684b2
JQ
1939/**
1940 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1941 *
3d0684b2 1942 * Returns zero on success
e0b266f0 1943 *
3d0684b2
JQ
1944 * @mis: current migration incoming state
1945 * @block_name: Name of the RAMBlock of the request. NULL means the
 1946 * same as the last one.
1947 * @start: RAMBlock starting page
1948 * @length: RAMBlock size
e0b266f0
DDAG
1949 */
1950int ram_discard_range(MigrationIncomingState *mis,
1951 const char *block_name,
1952 uint64_t start, size_t length)
1953{
1954 int ret = -1;
1955
d3a5038c
DDAG
1956 trace_ram_discard_range(block_name, start, length);
1957
e0b266f0
DDAG
1958 rcu_read_lock();
1959 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1960
1961 if (!rb) {
1962 error_report("ram_discard_range: Failed to find block '%s'",
1963 block_name);
1964 goto err;
1965 }
1966
d3a5038c 1967 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1968
1969err:
1970 rcu_read_unlock();
1971
1972 return ret;
1973}
1974
a91246c9 1975static int ram_save_init_globals(void)
56e93d26 1976{
56e93d26
JQ
1977 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1978
56e93d26
JQ
1979 dirty_rate_high_cnt = 0;
1980 bitmap_sync_count = 0;
1981 migration_bitmap_sync_init();
dd631697 1982 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
1983
1984 if (migrate_use_xbzrle()) {
1985 XBZRLE_cache_lock();
adb65dec 1986 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1987 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1988 TARGET_PAGE_SIZE,
1989 TARGET_PAGE_SIZE);
1990 if (!XBZRLE.cache) {
1991 XBZRLE_cache_unlock();
1992 error_report("Error creating cache");
1993 return -1;
1994 }
1995 XBZRLE_cache_unlock();
1996
1997 /* We prefer not to abort if there is no memory */
1998 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1999 if (!XBZRLE.encoded_buf) {
2000 error_report("Error allocating encoded_buf");
2001 return -1;
2002 }
2003
2004 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2005 if (!XBZRLE.current_buf) {
2006 error_report("Error allocating current_buf");
2007 g_free(XBZRLE.encoded_buf);
2008 XBZRLE.encoded_buf = NULL;
2009 return -1;
2010 }
2011
2012 acct_clear();
2013 }
2014
49877834
PB
2015 /* For memory_global_dirty_log_start below. */
2016 qemu_mutex_lock_iothread();
2017
56e93d26
JQ
2018 qemu_mutex_lock_ramlist();
2019 rcu_read_lock();
2020 bytes_transferred = 0;
2021 reset_ram_globals();
2022
f3f491fc 2023 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
0827b9e9
AA
2024 /* Skip setting bitmap if there is no RAM */
2025 if (ram_bytes_total()) {
2026 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2027 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2028 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2029
2030 if (migrate_postcopy_ram()) {
2031 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2032 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2033 }
f3f491fc
DDAG
2034 }
2035
56e93d26
JQ
2036 /*
2037 * Count the total number of pages used by ram blocks not including any
2038 * gaps due to alignment or unplugs.
2039 */
2040 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2041
2042 memory_global_dirty_log_start();
2043 migration_bitmap_sync();
2044 qemu_mutex_unlock_ramlist();
49877834 2045 qemu_mutex_unlock_iothread();
a91246c9
HZ
2046 rcu_read_unlock();
2047
2048 return 0;
2049}
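/*
 * Note the two different page counts above: the bitmaps are sized from
 * last_ram_offset(), so they also cover gaps in the RAM address space,
 * while migration_dirty_pages is derived from ram_bytes_total() and so
 * counts only pages actually backed by RAM blocks.
 */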
2050
3d0684b2
JQ
2051/*
2052 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2053 * long-running RCU critical section. When rcu-reclaims in the code
2054 * start to become numerous it will be necessary to reduce the
2055 * granularity of these critical sections.
2056 */
2057
3d0684b2
JQ
2058/**
2059 * ram_save_setup: Setup RAM for migration
2060 *
2061 * Returns zero to indicate success and negative for error
2062 *
2063 * @f: QEMUFile where to send the data
2064 * @opaque: RAMState pointer
2065 */
a91246c9
HZ
2066static int ram_save_setup(QEMUFile *f, void *opaque)
2067{
2068 RAMBlock *block;
2069
 2070    /* migration has already set up the bitmap; reuse it. */
2071 if (!migration_in_colo_state()) {
2072 if (ram_save_init_globals() < 0) {
2073 return -1;
2074 }
2075 }
2076
2077 rcu_read_lock();
56e93d26
JQ
2078
2079 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2080
2081 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2082 qemu_put_byte(f, strlen(block->idstr));
2083 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2084 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2085 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2086 qemu_put_be64(f, block->page_size);
2087 }
56e93d26
JQ
2088 }
2089
2090 rcu_read_unlock();
2091
2092 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2093 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2094
2095 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2096
2097 return 0;
2098}
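/*
 * A sketch of the stream ram_save_setup() emits, reconstructed from the
 * qemu_put_* calls above (RDMA hooks may add their own data in between):
 *
 *   be64  ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *     u8    strlen(idstr)
 *     bytes idstr (not NUL terminated)
 *     be64  used_length
 *     be64  page_size   (only if postcopy is enabled and the block's
 *                        page size differs from qemu_host_page_size)
 *   be64  RAM_SAVE_FLAG_EOS
 */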
2099
3d0684b2
JQ
2100/**
2101 * ram_save_iterate: iterative stage for migration
2102 *
2103 * Returns zero to indicate success and negative for error
2104 *
2105 * @f: QEMUFile where to send the data
2106 * @opaque: RAMState pointer
2107 */
56e93d26
JQ
2108static int ram_save_iterate(QEMUFile *f, void *opaque)
2109{
2110 int ret;
2111 int i;
2112 int64_t t0;
5c90308f 2113 int done = 0;
56e93d26
JQ
2114
2115 rcu_read_lock();
2116 if (ram_list.version != last_version) {
2117 reset_ram_globals();
2118 }
2119
2120 /* Read version before ram_list.blocks */
2121 smp_rmb();
2122
2123 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2124
2125 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2126 i = 0;
2127 while ((ret = qemu_file_rate_limit(f)) == 0) {
2128 int pages;
2129
2130 pages = ram_find_and_save_block(f, false, &bytes_transferred);
 2131        /* no more pages to send */
2132 if (pages == 0) {
5c90308f 2133 done = 1;
56e93d26
JQ
2134 break;
2135 }
56e93d26 2136 acct_info.iterations++;
070afca2 2137
56e93d26
JQ
 2138        /* we want to check in the 1st loop, just in case it was the 1st time
 2139           and we had to sync the dirty bitmap.
 2140           qemu_clock_get_ns() is a bit expensive, so we only check once
 2141           every few iterations
2142 */
2143 if ((i & 63) == 0) {
2144 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2145 if (t1 > MAX_WAIT) {
55c4446b 2146 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2147 break;
2148 }
2149 }
2150 i++;
2151 }
2152 flush_compressed_data(f);
2153 rcu_read_unlock();
2154
2155 /*
2156 * Must occur before EOS (or any QEMUFile operation)
2157 * because of RDMA protocol.
2158 */
2159 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2160
2161 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2162 bytes_transferred += 8;
2163
2164 ret = qemu_file_get_error(f);
2165 if (ret < 0) {
2166 return ret;
2167 }
2168
5c90308f 2169 return done;
56e93d26
JQ
2170}
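/*
 * Note on the loop above: qemu_file_rate_limit() is checked on every
 * iteration, but the wall-clock test against MAX_WAIT only runs once
 * every 64 iterations ((i & 63) == 0) because qemu_clock_get_ns() is
 * comparatively expensive; t1 is the time elapsed since t0 in
 * milliseconds.
 */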
2171
3d0684b2
JQ
2172/**
2173 * ram_save_complete: function called to send the remaining amount of ram
2174 *
2175 * Returns zero to indicate success
2176 *
2177 * Called with iothread lock
2178 *
2179 * @f: QEMUFile where to send the data
2180 * @opaque: RAMState pointer
2181 */
56e93d26
JQ
2182static int ram_save_complete(QEMUFile *f, void *opaque)
2183{
2184 rcu_read_lock();
2185
663e6c1d
DDAG
2186 if (!migration_in_postcopy(migrate_get_current())) {
2187 migration_bitmap_sync();
2188 }
56e93d26
JQ
2189
2190 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2191
2192 /* try transferring iterative blocks of memory */
2193
2194 /* flush all remaining blocks regardless of rate limiting */
2195 while (true) {
2196 int pages;
2197
a91246c9
HZ
2198 pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2199 &bytes_transferred);
56e93d26
JQ
 2200        /* no more blocks to send */
2201 if (pages == 0) {
2202 break;
2203 }
2204 }
2205
2206 flush_compressed_data(f);
2207 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2208
2209 rcu_read_unlock();
d09a6fde 2210
56e93d26
JQ
2211 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2212
2213 return 0;
2214}
2215
c31b098f
DDAG
2216static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2217 uint64_t *non_postcopiable_pending,
2218 uint64_t *postcopiable_pending)
56e93d26
JQ
2219{
2220 uint64_t remaining_size;
2221
2222 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2223
663e6c1d
DDAG
2224 if (!migration_in_postcopy(migrate_get_current()) &&
2225 remaining_size < max_size) {
56e93d26
JQ
2226 qemu_mutex_lock_iothread();
2227 rcu_read_lock();
2228 migration_bitmap_sync();
2229 rcu_read_unlock();
2230 qemu_mutex_unlock_iothread();
2231 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2232 }
c31b098f
DDAG
2233
2234 /* We can do postcopy, and all the data is postcopiable */
2235 *postcopiable_pending += remaining_size;
56e93d26
JQ
2236}
2237
2238static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2239{
2240 unsigned int xh_len;
2241 int xh_flags;
063e760a 2242 uint8_t *loaded_data;
56e93d26
JQ
2243
2244 if (!xbzrle_decoded_buf) {
2245 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2246 }
063e760a 2247 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2248
2249 /* extract RLE header */
2250 xh_flags = qemu_get_byte(f);
2251 xh_len = qemu_get_be16(f);
2252
2253 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2254 error_report("Failed to load XBZRLE page - wrong compression!");
2255 return -1;
2256 }
2257
2258 if (xh_len > TARGET_PAGE_SIZE) {
2259 error_report("Failed to load XBZRLE page - len overflow!");
2260 return -1;
2261 }
2262 /* load data and decode */
063e760a 2263 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2264
2265 /* decode RLE */
063e760a 2266 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2267 TARGET_PAGE_SIZE) == -1) {
2268 error_report("Failed to load XBZRLE page - decode error!");
2269 return -1;
2270 }
2271
2272 return 0;
2273}
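/*
 * The XBZRLE page layout implied by the qemu_get_* calls above, for
 * reference:
 *
 *   u8    xh_flags   must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len     encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of encoded data, decoded as a delta against the
 *         current contents of the destination page ("host")
 */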
2274
3d0684b2
JQ
2275/**
2276 * ram_block_from_stream: read a RAMBlock id from the migration stream
2277 *
2278 * Must be called from within a rcu critical section.
2279 *
56e93d26 2280 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2281 *
3d0684b2
JQ
2282 * @f: QEMUFile where to read the data from
2283 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2284 */
3d0684b2 2285static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2286{
2287 static RAMBlock *block = NULL;
2288 char id[256];
2289 uint8_t len;
2290
2291 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2292 if (!block) {
56e93d26
JQ
2293 error_report("Ack, bad migration stream!");
2294 return NULL;
2295 }
4c4bad48 2296 return block;
56e93d26
JQ
2297 }
2298
2299 len = qemu_get_byte(f);
2300 qemu_get_buffer(f, (uint8_t *)id, len);
2301 id[len] = 0;
2302
e3dd7493 2303 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2304 if (!block) {
2305 error_report("Can't find block %s", id);
2306 return NULL;
56e93d26
JQ
2307 }
2308
4c4bad48
HZ
2309 return block;
2310}
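/*
 * The static 'block' above is what makes RAM_SAVE_FLAG_CONTINUE work on
 * the load side: the sender only spells out a RAMBlock id when it
 * changes, and every following page with the CONTINUE bit set reuses the
 * block that was last looked up here.
 */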
2311
2312static inline void *host_from_ram_block_offset(RAMBlock *block,
2313 ram_addr_t offset)
2314{
2315 if (!offset_in_ramblock(block, offset)) {
2316 return NULL;
2317 }
2318
2319 return block->host + offset;
56e93d26
JQ
2320}
2321
3d0684b2
JQ
2322/**
2323 * ram_handle_compressed: handle the zero page case
2324 *
56e93d26
JQ
2325 * If a page (or a whole RDMA chunk) has been
2326 * determined to be zero, then zap it.
3d0684b2
JQ
2327 *
2328 * @host: host address for the zero page
 2329 * @ch: what the page is filled with. We only support zero
2330 * @size: size of the zero page
56e93d26
JQ
2331 */
2332void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2333{
2334 if (ch != 0 || !is_zero_range(host, size)) {
2335 memset(host, ch, size);
2336 }
2337}
2338
2339static void *do_data_decompress(void *opaque)
2340{
2341 DecompressParam *param = opaque;
2342 unsigned long pagesize;
33d151f4
LL
2343 uint8_t *des;
2344 int len;
56e93d26 2345
33d151f4 2346 qemu_mutex_lock(&param->mutex);
90e56fb4 2347 while (!param->quit) {
33d151f4
LL
2348 if (param->des) {
2349 des = param->des;
2350 len = param->len;
2351 param->des = 0;
2352 qemu_mutex_unlock(&param->mutex);
2353
56e93d26 2354 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
 2355            /* uncompress() can fail in some cases, especially when the
 2356             * page was dirtied while it was being compressed; that is not
 2357             * a problem because the dirty page will be retransmitted and
 2358             * uncompress() won't corrupt the data in other pages.
2359 */
33d151f4
LL
2360 uncompress((Bytef *)des, &pagesize,
2361 (const Bytef *)param->compbuf, len);
73a8912b 2362
33d151f4
LL
2363 qemu_mutex_lock(&decomp_done_lock);
2364 param->done = true;
2365 qemu_cond_signal(&decomp_done_cond);
2366 qemu_mutex_unlock(&decomp_done_lock);
2367
2368 qemu_mutex_lock(&param->mutex);
2369 } else {
2370 qemu_cond_wait(&param->cond, &param->mutex);
2371 }
56e93d26 2372 }
33d151f4 2373 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2374
2375 return NULL;
2376}
2377
5533b2e9
LL
2378static void wait_for_decompress_done(void)
2379{
2380 int idx, thread_count;
2381
2382 if (!migrate_use_compression()) {
2383 return;
2384 }
2385
2386 thread_count = migrate_decompress_threads();
2387 qemu_mutex_lock(&decomp_done_lock);
2388 for (idx = 0; idx < thread_count; idx++) {
2389 while (!decomp_param[idx].done) {
2390 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2391 }
2392 }
2393 qemu_mutex_unlock(&decomp_done_lock);
2394}
2395
56e93d26
JQ
2396void migrate_decompress_threads_create(void)
2397{
2398 int i, thread_count;
2399
2400 thread_count = migrate_decompress_threads();
2401 decompress_threads = g_new0(QemuThread, thread_count);
2402 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2403 qemu_mutex_init(&decomp_done_lock);
2404 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2405 for (i = 0; i < thread_count; i++) {
2406 qemu_mutex_init(&decomp_param[i].mutex);
2407 qemu_cond_init(&decomp_param[i].cond);
2408 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2409 decomp_param[i].done = true;
90e56fb4 2410 decomp_param[i].quit = false;
56e93d26
JQ
2411 qemu_thread_create(decompress_threads + i, "decompress",
2412 do_data_decompress, decomp_param + i,
2413 QEMU_THREAD_JOINABLE);
2414 }
2415}
2416
2417void migrate_decompress_threads_join(void)
2418{
2419 int i, thread_count;
2420
56e93d26
JQ
2421 thread_count = migrate_decompress_threads();
2422 for (i = 0; i < thread_count; i++) {
2423 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2424 decomp_param[i].quit = true;
56e93d26
JQ
2425 qemu_cond_signal(&decomp_param[i].cond);
2426 qemu_mutex_unlock(&decomp_param[i].mutex);
2427 }
2428 for (i = 0; i < thread_count; i++) {
2429 qemu_thread_join(decompress_threads + i);
2430 qemu_mutex_destroy(&decomp_param[i].mutex);
2431 qemu_cond_destroy(&decomp_param[i].cond);
2432 g_free(decomp_param[i].compbuf);
2433 }
2434 g_free(decompress_threads);
2435 g_free(decomp_param);
56e93d26
JQ
2436 decompress_threads = NULL;
2437 decomp_param = NULL;
56e93d26
JQ
2438}
2439
c1bc6626 2440static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2441 void *host, int len)
2442{
2443 int idx, thread_count;
2444
2445 thread_count = migrate_decompress_threads();
73a8912b 2446 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2447 while (true) {
2448 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2449 if (decomp_param[idx].done) {
33d151f4
LL
2450 decomp_param[idx].done = false;
2451 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2452 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2453 decomp_param[idx].des = host;
2454 decomp_param[idx].len = len;
33d151f4
LL
2455 qemu_cond_signal(&decomp_param[idx].cond);
2456 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2457 break;
2458 }
2459 }
2460 if (idx < thread_count) {
2461 break;
73a8912b
LL
2462 } else {
2463 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2464 }
2465 }
73a8912b 2466 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2467}
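/*
 * A minimal sketch of the handoff between the load thread (above) and
 * the decompress workers (do_data_decompress):
 *
 *   load thread: under decomp_done_lock, wait until some
 *                decomp_param[idx].done is true, clear it, then hand
 *                over (compbuf, des, len) under that worker's mutex and
 *                signal its cond.
 *   worker:      uncompress() compbuf into des, then set done = true
 *                under decomp_done_lock and signal decomp_done_cond.
 *
 * wait_for_decompress_done() waits until every worker's done flag is
 * true again, i.e. no page is still being decompressed.
 */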
2468
3d0684b2
JQ
2469/**
2470 * ram_postcopy_incoming_init: allocate postcopy data structures
2471 *
2472 * Returns 0 for success and negative if there was one error
2473 *
2474 * @mis: current migration incoming state
2475 *
2476 * Allocate data structures etc needed by incoming migration with
 2477 * postcopy-ram. postcopy-ram's similarly named
2478 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2479 */
2480int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2481{
2482 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2483
2484 return postcopy_ram_incoming_init(mis, ram_pages);
2485}
2486
3d0684b2
JQ
2487/**
2488 * ram_load_postcopy: load a page in postcopy case
2489 *
2490 * Returns 0 for success or -errno in case of error
2491 *
a7180877
DDAG
2492 * Called in postcopy mode by ram_load().
2493 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2494 *
 2495 * @f: QEMUFile where to read the data from
a7180877
DDAG
2496 */
2497static int ram_load_postcopy(QEMUFile *f)
2498{
2499 int flags = 0, ret = 0;
2500 bool place_needed = false;
28abd200 2501 bool matching_page_sizes = false;
a7180877
DDAG
2502 MigrationIncomingState *mis = migration_incoming_get_current();
2503 /* Temporary page that is later 'placed' */
2504 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2505 void *last_host = NULL;
a3b6ff6d 2506 bool all_zero = false;
a7180877
DDAG
2507
2508 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2509 ram_addr_t addr;
2510 void *host = NULL;
2511 void *page_buffer = NULL;
2512 void *place_source = NULL;
df9ff5e1 2513 RAMBlock *block = NULL;
a7180877 2514 uint8_t ch;
a7180877
DDAG
2515
2516 addr = qemu_get_be64(f);
2517 flags = addr & ~TARGET_PAGE_MASK;
2518 addr &= TARGET_PAGE_MASK;
2519
2520 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2521 place_needed = false;
2522 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2523 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2524
2525 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2526 if (!host) {
2527 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2528 ret = -EINVAL;
2529 break;
2530 }
28abd200 2531 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2532 /*
28abd200
DDAG
2533 * Postcopy requires that we place whole host pages atomically;
2534 * these may be huge pages for RAMBlocks that are backed by
2535 * hugetlbfs.
a7180877
DDAG
2536 * To make it atomic, the data is read into a temporary page
2537 * that's moved into place later.
 2538             * The migration protocol uses, possibly smaller, target pages;
 2539             * however, the source ensures it always sends all the components
2540 * of a host page in order.
2541 */
2542 page_buffer = postcopy_host_page +
28abd200 2543 ((uintptr_t)host & (block->page_size - 1));
a7180877 2544 /* If all TP are zero then we can optimise the place */
28abd200 2545 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2546 all_zero = true;
c53b7ddc
DDAG
2547 } else {
2548 /* not the 1st TP within the HP */
2549 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2550 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2551 host, last_host);
2552 ret = -EINVAL;
2553 break;
2554 }
a7180877
DDAG
2555 }
2556
c53b7ddc 2557
a7180877
DDAG
2558 /*
2559 * If it's the last part of a host page then we place the host
2560 * page
2561 */
2562 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2563 (block->page_size - 1)) == 0;
a7180877
DDAG
2564 place_source = postcopy_host_page;
2565 }
c53b7ddc 2566 last_host = host;
a7180877
DDAG
2567
2568 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2569 case RAM_SAVE_FLAG_COMPRESS:
2570 ch = qemu_get_byte(f);
2571 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2572 if (ch) {
2573 all_zero = false;
2574 }
2575 break;
2576
2577 case RAM_SAVE_FLAG_PAGE:
2578 all_zero = false;
2579 if (!place_needed || !matching_page_sizes) {
2580 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2581 } else {
2582 /* Avoids the qemu_file copy during postcopy, which is
2583 * going to do a copy later; can only do it when we
2584 * do this read in one go (matching page sizes)
2585 */
2586 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2587 TARGET_PAGE_SIZE);
2588 }
2589 break;
2590 case RAM_SAVE_FLAG_EOS:
2591 /* normal exit */
2592 break;
2593 default:
2594 error_report("Unknown combination of migration flags: %#x"
2595 " (postcopy mode)", flags);
2596 ret = -EINVAL;
2597 }
2598
2599 if (place_needed) {
2600 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2601 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2602
a7180877 2603 if (all_zero) {
df9ff5e1
DDAG
2604 ret = postcopy_place_page_zero(mis, place_dest,
2605 block->page_size);
a7180877 2606 } else {
df9ff5e1
DDAG
2607 ret = postcopy_place_page(mis, place_dest,
2608 place_source, block->page_size);
a7180877
DDAG
2609 }
2610 }
2611 if (!ret) {
2612 ret = qemu_file_get_error(f);
2613 }
2614 }
2615
2616 return ret;
2617}
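/*
 * A worked example of the placement logic above, assuming a 2MB
 * huge-page RAMBlock and 4KB target pages: the 512 target pages of one
 * host page arrive in order and are copied into the matching offsets of
 * postcopy_host_page; place_needed only becomes true for the 512th
 * page, at which point the whole 2MB page is placed atomically with
 * postcopy_place_page() (or postcopy_place_page_zero() if every target
 * page turned out to be zero).
 */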
2618
56e93d26
JQ
2619static int ram_load(QEMUFile *f, void *opaque, int version_id)
2620{
2621 int flags = 0, ret = 0;
2622 static uint64_t seq_iter;
2623 int len = 0;
a7180877
DDAG
2624 /*
2625 * If system is running in postcopy mode, page inserts to host memory must
2626 * be atomic
2627 */
2628 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2629 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2630 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2631
2632 seq_iter++;
2633
2634 if (version_id != 4) {
2635 ret = -EINVAL;
2636 }
2637
2638 /* This RCU critical section can be very long running.
2639 * When RCU reclaims in the code start to become numerous,
2640 * it will be necessary to reduce the granularity of this
2641 * critical section.
2642 */
2643 rcu_read_lock();
a7180877
DDAG
2644
2645 if (postcopy_running) {
2646 ret = ram_load_postcopy(f);
2647 }
2648
2649 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2650 ram_addr_t addr, total_ram_bytes;
a776aa15 2651 void *host = NULL;
56e93d26
JQ
2652 uint8_t ch;
2653
2654 addr = qemu_get_be64(f);
2655 flags = addr & ~TARGET_PAGE_MASK;
2656 addr &= TARGET_PAGE_MASK;
2657
a776aa15
DDAG
2658 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2659 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2660 RAMBlock *block = ram_block_from_stream(f, flags);
2661
2662 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2663 if (!host) {
2664 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2665 ret = -EINVAL;
2666 break;
2667 }
2668 }
2669
56e93d26
JQ
2670 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2671 case RAM_SAVE_FLAG_MEM_SIZE:
2672 /* Synchronize RAM block list */
2673 total_ram_bytes = addr;
2674 while (!ret && total_ram_bytes) {
2675 RAMBlock *block;
56e93d26
JQ
2676 char id[256];
2677 ram_addr_t length;
2678
2679 len = qemu_get_byte(f);
2680 qemu_get_buffer(f, (uint8_t *)id, len);
2681 id[len] = 0;
2682 length = qemu_get_be64(f);
2683
e3dd7493
DDAG
2684 block = qemu_ram_block_by_name(id);
2685 if (block) {
2686 if (length != block->used_length) {
2687 Error *local_err = NULL;
56e93d26 2688
fa53a0e5 2689 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2690 &local_err);
2691 if (local_err) {
2692 error_report_err(local_err);
56e93d26 2693 }
56e93d26 2694 }
ef08fb38
DDAG
2695 /* For postcopy we need to check hugepage sizes match */
2696 if (postcopy_advised &&
2697 block->page_size != qemu_host_page_size) {
2698 uint64_t remote_page_size = qemu_get_be64(f);
2699 if (remote_page_size != block->page_size) {
2700 error_report("Mismatched RAM page size %s "
2701 "(local) %zd != %" PRId64,
2702 id, block->page_size,
2703 remote_page_size);
2704 ret = -EINVAL;
2705 }
2706 }
e3dd7493
DDAG
2707 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2708 block->idstr);
2709 } else {
56e93d26
JQ
2710 error_report("Unknown ramblock \"%s\", cannot "
2711 "accept migration", id);
2712 ret = -EINVAL;
2713 }
2714
2715 total_ram_bytes -= length;
2716 }
2717 break;
a776aa15 2718
56e93d26 2719 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2720 ch = qemu_get_byte(f);
2721 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2722 break;
a776aa15 2723
56e93d26 2724 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2725 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2726 break;
56e93d26 2727
a776aa15 2728 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2729 len = qemu_get_be32(f);
2730 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2731 error_report("Invalid compressed data length: %d", len);
2732 ret = -EINVAL;
2733 break;
2734 }
c1bc6626 2735 decompress_data_with_multi_threads(f, host, len);
56e93d26 2736 break;
a776aa15 2737
56e93d26 2738 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2739 if (load_xbzrle(f, addr, host) < 0) {
2740 error_report("Failed to decompress XBZRLE page at "
2741 RAM_ADDR_FMT, addr);
2742 ret = -EINVAL;
2743 break;
2744 }
2745 break;
2746 case RAM_SAVE_FLAG_EOS:
2747 /* normal exit */
2748 break;
2749 default:
2750 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2751 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2752 } else {
2753 error_report("Unknown combination of migration flags: %#x",
2754 flags);
2755 ret = -EINVAL;
2756 }
2757 }
2758 if (!ret) {
2759 ret = qemu_file_get_error(f);
2760 }
2761 }
2762
5533b2e9 2763 wait_for_decompress_done();
56e93d26 2764 rcu_read_unlock();
55c4446b 2765 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2766 return ret;
2767}
2768
2769static SaveVMHandlers savevm_ram_handlers = {
2770 .save_live_setup = ram_save_setup,
2771 .save_live_iterate = ram_save_iterate,
763c906b 2772 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2773 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2774 .save_live_pending = ram_save_pending,
2775 .load_state = ram_load,
6ad2a215 2776 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2777};
2778
2779void ram_mig_init(void)
2780{
2781 qemu_mutex_init(&XBZRLE.lock);
2782 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2783}
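/*
 * register_savevm_live() registers the handlers above as section "ram"
 * with version 4, which is the only version_id ram_load() accepts.
 */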