git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
ram: Move bitmap_sync_count into RAMState
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h start with 0x100 next */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
60
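/*
 * These flags are folded into the low bits of the page-aligned offset
 * that save_page_header() writes, so a single be64 value on the wire
 * carries both.  A worked example (illustrative values only): a
 * continued XBZRLE page at block offset 0x200000 is announced as
 *
 *     0x200000 | RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_XBZRLE == 0x200060
 *
 * and the page-aligned offset is recovered with (offset & TARGET_PAGE_MASK),
 * as do_compress_ram_page() does below.
 */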
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* struct contains XBZRLE cache and a static page
69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
102 *
103 * Returns the new_size or negative in case of error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
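/*
 * Worked example (illustrative numbers): requesting a 300 MiB cache
 * returns pow2floor(300 MiB) == 256 MiB.  If pow2floor(new_size) already
 * equals migrate_xbzrle_cache_size(), the existing cache is left
 * untouched and only the rounded size is reported back.
 */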
140
6f37bb8b
JQ
141/* State of RAM for migration */
142struct RAMState {
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
152 bool ram_bulk_stage;
8d820d6f
JQ
153 /* How many times we have dirty too many pages */
154 int dirty_rate_high_cnt;
5a987738
JQ
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
6f37bb8b
JQ
157};
158typedef struct RAMState RAMState;
159
160static RAMState ram_state;
161
56e93d26
JQ
162/* accounting for migration statistics */
163typedef struct AccountingInfo {
164 uint64_t dup_pages;
165 uint64_t skipped_pages;
166 uint64_t norm_pages;
167 uint64_t iterations;
168 uint64_t xbzrle_bytes;
169 uint64_t xbzrle_pages;
170 uint64_t xbzrle_cache_miss;
171 double xbzrle_cache_miss_rate;
172 uint64_t xbzrle_overflows;
173} AccountingInfo;
174
175static AccountingInfo acct_info;
176
177static void acct_clear(void)
178{
179 memset(&acct_info, 0, sizeof(acct_info));
180}
181
182uint64_t dup_mig_bytes_transferred(void)
183{
184 return acct_info.dup_pages * TARGET_PAGE_SIZE;
185}
186
187uint64_t dup_mig_pages_transferred(void)
188{
189 return acct_info.dup_pages;
190}
191
192uint64_t skipped_mig_bytes_transferred(void)
193{
194 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
195}
196
197uint64_t skipped_mig_pages_transferred(void)
198{
199 return acct_info.skipped_pages;
200}
201
202uint64_t norm_mig_bytes_transferred(void)
203{
204 return acct_info.norm_pages * TARGET_PAGE_SIZE;
205}
206
207uint64_t norm_mig_pages_transferred(void)
208{
209 return acct_info.norm_pages;
210}
211
212uint64_t xbzrle_mig_bytes_transferred(void)
213{
214 return acct_info.xbzrle_bytes;
215}
216
217uint64_t xbzrle_mig_pages_transferred(void)
218{
219 return acct_info.xbzrle_pages;
220}
221
222uint64_t xbzrle_mig_pages_cache_miss(void)
223{
224 return acct_info.xbzrle_cache_miss;
225}
226
227double xbzrle_mig_cache_miss_rate(void)
228{
229 return acct_info.xbzrle_cache_miss_rate;
230}
231
232uint64_t xbzrle_mig_pages_overflow(void)
233{
234 return acct_info.xbzrle_overflows;
235}
236
dd631697 237static QemuMutex migration_bitmap_mutex;
56e93d26 238static uint64_t migration_dirty_pages;
56e93d26 239
b8fb8cb7
DDAG
240/* used by the search for pages to send */
241struct PageSearchStatus {
242 /* Current block being searched */
243 RAMBlock *block;
244 /* Current offset to search from */
245 ram_addr_t offset;
246 /* Set once we wrap around */
247 bool complete_round;
248};
249typedef struct PageSearchStatus PageSearchStatus;
250
60be6340
DL
251static struct BitmapRcu {
252 struct rcu_head rcu;
f3f491fc 253 /* Main migration bitmap */
60be6340 254 unsigned long *bmap;
f3f491fc
DDAG
255 /* bitmap of pages that haven't been sent even once
256 * only maintained and used in postcopy at the moment
257 * where it's used to send the dirtymap at the start
258 * of the postcopy phase
259 */
260 unsigned long *unsentmap;
60be6340
DL
261} *migration_bitmap_rcu;
262
56e93d26 263struct CompressParam {
56e93d26 264 bool done;
90e56fb4 265 bool quit;
56e93d26
JQ
266 QEMUFile *file;
267 QemuMutex mutex;
268 QemuCond cond;
269 RAMBlock *block;
270 ram_addr_t offset;
271};
272typedef struct CompressParam CompressParam;
273
274struct DecompressParam {
73a8912b 275 bool done;
90e56fb4 276 bool quit;
56e93d26
JQ
277 QemuMutex mutex;
278 QemuCond cond;
279 void *des;
d341d9f3 280 uint8_t *compbuf;
56e93d26
JQ
281 int len;
282};
283typedef struct DecompressParam DecompressParam;
284
285static CompressParam *comp_param;
286static QemuThread *compress_threads;
287/* comp_done_cond is used to wake up the migration thread when
288 * one of the compression threads has finished the compression.
289 * comp_done_lock is used together with comp_done_cond.
290 */
0d9f9a5c
LL
291static QemuMutex comp_done_lock;
292static QemuCond comp_done_cond;
56e93d26
JQ
293/* The empty QEMUFileOps will be used by file in CompressParam */
294static const QEMUFileOps empty_ops = { };
295
296static bool compression_switch;
56e93d26
JQ
297static DecompressParam *decomp_param;
298static QemuThread *decompress_threads;
73a8912b
LL
299static QemuMutex decomp_done_lock;
300static QemuCond decomp_done_cond;
56e93d26 301
a7a9a88f
LL
302static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
303 ram_addr_t offset);
56e93d26
JQ
304
305static void *do_data_compress(void *opaque)
306{
307 CompressParam *param = opaque;
a7a9a88f
LL
308 RAMBlock *block;
309 ram_addr_t offset;
56e93d26 310
a7a9a88f 311 qemu_mutex_lock(&param->mutex);
90e56fb4 312 while (!param->quit) {
a7a9a88f
LL
313 if (param->block) {
314 block = param->block;
315 offset = param->offset;
316 param->block = NULL;
317 qemu_mutex_unlock(&param->mutex);
318
319 do_compress_ram_page(param->file, block, offset);
320
0d9f9a5c 321 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 322 param->done = true;
0d9f9a5c
LL
323 qemu_cond_signal(&comp_done_cond);
324 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
325
326 qemu_mutex_lock(&param->mutex);
327 } else {
56e93d26
JQ
328 qemu_cond_wait(&param->cond, &param->mutex);
329 }
56e93d26 330 }
a7a9a88f 331 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
332
333 return NULL;
334}
335
336static inline void terminate_compression_threads(void)
337{
338 int idx, thread_count;
339
340 thread_count = migrate_compress_threads();
3d0684b2 341
56e93d26
JQ
342 for (idx = 0; idx < thread_count; idx++) {
343 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 344 comp_param[idx].quit = true;
56e93d26
JQ
345 qemu_cond_signal(&comp_param[idx].cond);
346 qemu_mutex_unlock(&comp_param[idx].mutex);
347 }
348}
349
350void migrate_compress_threads_join(void)
351{
352 int i, thread_count;
353
354 if (!migrate_use_compression()) {
355 return;
356 }
357 terminate_compression_threads();
358 thread_count = migrate_compress_threads();
359 for (i = 0; i < thread_count; i++) {
360 qemu_thread_join(compress_threads + i);
361 qemu_fclose(comp_param[i].file);
362 qemu_mutex_destroy(&comp_param[i].mutex);
363 qemu_cond_destroy(&comp_param[i].cond);
364 }
0d9f9a5c
LL
365 qemu_mutex_destroy(&comp_done_lock);
366 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
367 g_free(compress_threads);
368 g_free(comp_param);
56e93d26
JQ
369 compress_threads = NULL;
370 comp_param = NULL;
56e93d26
JQ
371}
372
373void migrate_compress_threads_create(void)
374{
375 int i, thread_count;
376
377 if (!migrate_use_compression()) {
378 return;
379 }
56e93d26
JQ
380 compression_switch = true;
381 thread_count = migrate_compress_threads();
382 compress_threads = g_new0(QemuThread, thread_count);
383 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
384 qemu_cond_init(&comp_done_cond);
385 qemu_mutex_init(&comp_done_lock);
56e93d26 386 for (i = 0; i < thread_count; i++) {
e110aa91
C
387 /* comp_param[i].file is just used as a dummy buffer to save data,
388 * set its ops to empty.
56e93d26
JQ
389 */
390 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
391 comp_param[i].done = true;
90e56fb4 392 comp_param[i].quit = false;
56e93d26
JQ
393 qemu_mutex_init(&comp_param[i].mutex);
394 qemu_cond_init(&comp_param[i].cond);
395 qemu_thread_create(compress_threads + i, "compress",
396 do_data_compress, comp_param + i,
397 QEMU_THREAD_JOINABLE);
398 }
399}
400
401/**
3d0684b2 402 * save_page_header: write page header to wire
56e93d26
JQ
403 *
404 * If this is not a continuation of the last sent block, it also writes the block identification
405 *
3d0684b2 406 * Returns the number of bytes written
56e93d26
JQ
407 *
408 * @f: QEMUFile where to send the data
409 * @block: block that contains the page we want to send
410 * @offset: offset inside the block for the page
411 * in the lower bits, it contains flags
412 */
413static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
414{
9f5f380b 415 size_t size, len;
56e93d26
JQ
416
417 qemu_put_be64(f, offset);
418 size = 8;
419
420 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
421 len = strlen(block->idstr);
422 qemu_put_byte(f, len);
423 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
424 size += 1 + len;
56e93d26
JQ
425 }
426 return size;
427}
428
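/*
 * On the wire the header written above is therefore either 8 bytes
 * (just the be64 offset|flags, when RAM_SAVE_FLAG_CONTINUE is set) or
 * 8 + 1 + strlen(idstr) bytes when the block name is (re)announced.
 * Sketch of the long form, assuming a block named "pc.ram":
 *
 *     be64   offset | flags
 *     u8     6              (strlen("pc.ram"))
 *     bytes  "pc.ram"
 */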
3d0684b2
JQ
429/**
430 * mig_throttle_guest_down: throttle down the guest
431 *
432 * Reduce amount of guest cpu execution to hopefully slow down memory
433 * writes. If guest dirty memory rate is reduced below the rate at
434 * which we can transfer pages to the destination then we should be
435 * able to complete migration. Some workloads dirty memory way too
436 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
437 */
438static void mig_throttle_guest_down(void)
439{
440 MigrationState *s = migrate_get_current();
2594f56d
DB
441 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
442 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
443
444 /* We have not started throttling yet. Let's start it. */
445 if (!cpu_throttle_active()) {
446 cpu_throttle_set(pct_initial);
447 } else {
448 /* Throttling already on, just increase the rate */
449 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
450 }
451}
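/*
 * Example of the resulting ramp, assuming cpu_throttle_initial=20 and
 * cpu_throttle_increment=10 (typical defaults, an assumption here):
 * successive calls throttle the guest to 20%, 30%, 40%, ... until
 * migration_bitmap_sync() stops asking for more throttling.
 */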
452
3d0684b2
JQ
453/**
454 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
455 *
6f37bb8b 456 * @rs: current RAM state
3d0684b2
JQ
457 * @current_addr: address for the zero page
458 *
459 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
460 * The important thing is that a stale (not-yet-0'd) page be replaced
461 * by the new data.
462 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 463 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 464 */
6f37bb8b 465static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 466{
6f37bb8b 467 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
468 return;
469 }
470
471 /* We don't care if this fails to allocate a new cache page
472 * as long as it updated an old one */
473 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 474 rs->bitmap_sync_count);
56e93d26
JQ
475}
476
477#define ENCODING_FLAG_XBZRLE 0x1
478
479/**
480 * save_xbzrle_page: compress and send current page
481 *
482 * Returns: 1 means that we wrote the page
483 * 0 means that page is identical to the one already sent
484 * -1 means that xbzrle would be longer than normal
485 *
5a987738 486 * @rs: current RAM state
56e93d26 487 * @f: QEMUFile where to send the data
3d0684b2
JQ
488 * @current_data: pointer to the address of the page contents
489 * @current_addr: addr of the page
56e93d26
JQ
490 * @block: block that contains the page we want to send
491 * @offset: offset inside the block for the page
492 * @last_stage: if we are at the completion stage
493 * @bytes_transferred: increase it with the number of transferred bytes
494 */
5a987738 495static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
56e93d26
JQ
496 ram_addr_t current_addr, RAMBlock *block,
497 ram_addr_t offset, bool last_stage,
498 uint64_t *bytes_transferred)
499{
500 int encoded_len = 0, bytes_xbzrle;
501 uint8_t *prev_cached_page;
502
5a987738 503 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
56e93d26
JQ
504 acct_info.xbzrle_cache_miss++;
505 if (!last_stage) {
506 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 507 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
508 return -1;
509 } else {
510 /* update *current_data when the page has been
511 inserted into cache */
512 *current_data = get_cached_data(XBZRLE.cache, current_addr);
513 }
514 }
515 return -1;
516 }
517
518 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
519
520 /* save current buffer into memory */
521 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
522
523 /* XBZRLE encoding (if there is no overflow) */
524 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
525 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
526 TARGET_PAGE_SIZE);
527 if (encoded_len == 0) {
55c4446b 528 trace_save_xbzrle_page_skipping();
56e93d26
JQ
529 return 0;
530 } else if (encoded_len == -1) {
55c4446b 531 trace_save_xbzrle_page_overflow();
56e93d26
JQ
532 acct_info.xbzrle_overflows++;
533 /* update data in the cache */
534 if (!last_stage) {
535 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
536 *current_data = prev_cached_page;
537 }
538 return -1;
539 }
540
541 /* we need to update the data in the cache, in order to get the same data */
542 if (!last_stage) {
543 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
544 }
545
546 /* Send XBZRLE based compressed page */
547 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
548 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
549 qemu_put_be16(f, encoded_len);
550 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
551 bytes_xbzrle += encoded_len + 1 + 2;
552 acct_info.xbzrle_pages++;
553 acct_info.xbzrle_bytes += bytes_xbzrle;
554 *bytes_transferred += bytes_xbzrle;
555
556 return 1;
557}
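/*
 * The XBZRLE record sent above is laid out as:
 *
 *     page header  (save_page_header, offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8           ENCODING_FLAG_XBZRLE        -> the "+ 1"
 *     be16         encoded_len                 -> the "+ 2"
 *     bytes        encoded_buf[encoded_len]
 *
 * which is why bytes_xbzrle grows by encoded_len + 1 + 2 on top of the
 * header size.
 */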
558
3d0684b2
JQ
559/**
560 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 561 *
3d0684b2
JQ
562 * Called with rcu_read_lock() to protect migration_bitmap
563 *
564 * Returns the byte offset within memory region of the start of a dirty page
565 *
6f37bb8b 566 * @rs: current RAM state
3d0684b2
JQ
567 * @rb: RAMBlock where to search for dirty pages
568 * @start: starting address (typically so we can continue from previous page)
569 * @ram_addr_abs: pointer into which to store the address of the dirty page
570 * within the global ram_addr space
f3f491fc 571 */
56e93d26 572static inline
6f37bb8b 573ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
574 ram_addr_t start,
575 ram_addr_t *ram_addr_abs)
56e93d26 576{
2f68e399 577 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 578 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
579 uint64_t rb_size = rb->used_length;
580 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 581 unsigned long *bitmap;
56e93d26
JQ
582
583 unsigned long next;
584
60be6340 585 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
6f37bb8b 586 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
587 next = nr + 1;
588 } else {
2ff64038 589 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
590 }
591
f3f491fc 592 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
593 return (next - base) << TARGET_PAGE_BITS;
594}
595
a82d593b
DDAG
596static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
597{
598 bool ret;
599 int nr = addr >> TARGET_PAGE_BITS;
600 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
601
602 ret = test_and_clear_bit(nr, bitmap);
603
604 if (ret) {
605 migration_dirty_pages--;
606 }
607 return ret;
608}
609
1ffb5dfd 610static int64_t num_dirty_pages_period;
56e93d26
JQ
611static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
612{
2ff64038 613 unsigned long *bitmap;
60be6340 614 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1ffb5dfd
CF
615 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
616 start, length, &num_dirty_pages_period);
56e93d26
JQ
617}
618
56e93d26
JQ
619/* Fix me: there are too many global variables used in the migration process. */
620static int64_t start_time;
621static int64_t bytes_xfer_prev;
56e93d26
JQ
622static uint64_t xbzrle_cache_miss_prev;
623static uint64_t iterations_prev;
624
625static void migration_bitmap_sync_init(void)
626{
627 start_time = 0;
628 bytes_xfer_prev = 0;
629 num_dirty_pages_period = 0;
630 xbzrle_cache_miss_prev = 0;
631 iterations_prev = 0;
632}
633
3d0684b2
JQ
634/**
635 * ram_pagesize_summary: calculate all the pagesizes of a VM
636 *
637 * Returns a summary bitmap of the page sizes of all RAMBlocks
638 *
639 * For VMs with just normal pages this is equivalent to the host page
640 * size. If it's got some huge pages then it's the OR of all the
641 * different page sizes.
e8ca1db2
DDAG
642 */
643uint64_t ram_pagesize_summary(void)
644{
645 RAMBlock *block;
646 uint64_t summary = 0;
647
648 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
649 summary |= block->page_size;
650 }
651
652 return summary;
653}
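/*
 * Worked example (illustrative): a guest whose RAM is normal 4 KiB
 * pages plus one 2 MiB hugepage-backed block yields
 *
 *     summary == 0x1000 | 0x200000 == 0x201000
 *
 * i.e. one bit set per distinct page size.
 */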
654
8d820d6f 655static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
656{
657 RAMBlock *block;
56e93d26
JQ
658 MigrationState *s = migrate_get_current();
659 int64_t end_time;
660 int64_t bytes_xfer_now;
661
5a987738 662 rs->bitmap_sync_count++;
56e93d26
JQ
663
664 if (!bytes_xfer_prev) {
665 bytes_xfer_prev = ram_bytes_transferred();
666 }
667
668 if (!start_time) {
669 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
670 }
671
672 trace_migration_bitmap_sync_start();
9c1f8f44 673 memory_global_dirty_log_sync();
56e93d26 674
dd631697 675 qemu_mutex_lock(&migration_bitmap_mutex);
56e93d26
JQ
676 rcu_read_lock();
677 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2f68e399 678 migration_bitmap_sync_range(block->offset, block->used_length);
56e93d26
JQ
679 }
680 rcu_read_unlock();
dd631697 681 qemu_mutex_unlock(&migration_bitmap_mutex);
56e93d26 682
1ffb5dfd
CF
683 trace_migration_bitmap_sync_end(num_dirty_pages_period);
684
56e93d26
JQ
685 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
686
687 /* more than 1 second = 1000 milliseconds */
688 if (end_time > start_time + 1000) {
689 if (migrate_auto_converge()) {
690 /* The following detection logic can be refined later. For now:
691 Check whether the bytes dirtied in this period exceed half of the
692 approx. amount of bytes that just got transferred since the last time we
070afca2
JH
693 were in this routine. If that happens twice, start or increase
694 throttling */
56e93d26 695 bytes_xfer_now = ram_bytes_transferred();
070afca2 696
56e93d26
JQ
697 if (s->dirty_pages_rate &&
698 (num_dirty_pages_period * TARGET_PAGE_SIZE >
699 (bytes_xfer_now - bytes_xfer_prev)/2) &&
8d820d6f 700 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 701 trace_migration_throttle();
8d820d6f 702 rs->dirty_rate_high_cnt = 0;
070afca2 703 mig_throttle_guest_down();
56e93d26
JQ
704 }
705 bytes_xfer_prev = bytes_xfer_now;
56e93d26 706 }
070afca2 707
56e93d26
JQ
708 if (migrate_use_xbzrle()) {
709 if (iterations_prev != acct_info.iterations) {
710 acct_info.xbzrle_cache_miss_rate =
711 (double)(acct_info.xbzrle_cache_miss -
712 xbzrle_cache_miss_prev) /
713 (acct_info.iterations - iterations_prev);
714 }
715 iterations_prev = acct_info.iterations;
716 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
717 }
718 s->dirty_pages_rate = num_dirty_pages_period * 1000
719 / (end_time - start_time);
720 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
721 start_time = end_time;
722 num_dirty_pages_period = 0;
723 }
5a987738 724 s->dirty_sync_count = rs->bitmap_sync_count;
4addcd4f 725 if (migrate_use_events()) {
5a987738 726 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 727 }
56e93d26
JQ
728}
729
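/*
 * Worked example of the auto-converge trigger above (illustrative
 * numbers): if roughly 40 MiB were transferred during the last one
 * second period while the guest dirtied 30 MiB of pages, then
 * 30 MiB > 40 MiB / 2 holds; once that condition has pushed
 * dirty_rate_high_cnt to 2, mig_throttle_guest_down() is called and
 * the counter is reset.
 */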
730/**
3d0684b2 731 * save_zero_page: send the zero page to the stream
56e93d26 732 *
3d0684b2 733 * Returns the number of pages written.
56e93d26
JQ
734 *
735 * @f: QEMUFile where to send the data
736 * @block: block that contains the page we want to send
737 * @offset: offset inside the block for the page
738 * @p: pointer to the page
739 * @bytes_transferred: increase it with the number of transferred bytes
740 */
741static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
742 uint8_t *p, uint64_t *bytes_transferred)
743{
744 int pages = -1;
745
746 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
747 acct_info.dup_pages++;
748 *bytes_transferred += save_page_header(f, block,
749 offset | RAM_SAVE_FLAG_COMPRESS);
750 qemu_put_byte(f, 0);
751 *bytes_transferred += 1;
752 pages = 1;
753 }
754
755 return pages;
756}
757
36449157 758static void ram_release_pages(MigrationState *ms, const char *rbname,
53f09a10
PB
759 uint64_t offset, int pages)
760{
761 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
762 return;
763 }
764
36449157 765 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
766}
767
56e93d26 768/**
3d0684b2 769 * ram_save_page: send the given page to the stream
56e93d26 770 *
3d0684b2 771 * Returns the number of pages written.
3fd3c4b3
DDAG
772 * < 0 - error
773 * >=0 - Number of pages written - this might legally be 0
774 * if xbzrle noticed the page was the same.
56e93d26 775 *
6f37bb8b 776 * @rs: current RAM state
3d0684b2 777 * @ms: current migration state
56e93d26
JQ
778 * @f: QEMUFile where to send the data
779 * @block: block that contains the page we want to send
780 * @offset: offset inside the block for the page
781 * @last_stage: if we are at the completion stage
782 * @bytes_transferred: increase it with the number of transferred bytes
783 */
6f37bb8b
JQ
784static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
785 PageSearchStatus *pss, bool last_stage,
786 uint64_t *bytes_transferred)
56e93d26
JQ
787{
788 int pages = -1;
789 uint64_t bytes_xmit;
790 ram_addr_t current_addr;
56e93d26
JQ
791 uint8_t *p;
792 int ret;
793 bool send_async = true;
a08f6890
HZ
794 RAMBlock *block = pss->block;
795 ram_addr_t offset = pss->offset;
56e93d26 796
2f68e399 797 p = block->host + offset;
56e93d26
JQ
798
799 /* When in doubt, send the page as normal */
800 bytes_xmit = 0;
801 ret = ram_control_save_page(f, block->offset,
802 offset, TARGET_PAGE_SIZE, &bytes_xmit);
803 if (bytes_xmit) {
804 *bytes_transferred += bytes_xmit;
805 pages = 1;
806 }
807
808 XBZRLE_cache_lock();
809
810 current_addr = block->offset + offset;
811
6f37bb8b 812 if (block == rs->last_sent_block) {
56e93d26
JQ
813 offset |= RAM_SAVE_FLAG_CONTINUE;
814 }
815 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
816 if (ret != RAM_SAVE_CONTROL_DELAYED) {
817 if (bytes_xmit > 0) {
818 acct_info.norm_pages++;
819 } else if (bytes_xmit == 0) {
820 acct_info.dup_pages++;
821 }
822 }
823 } else {
824 pages = save_zero_page(f, block, offset, p, bytes_transferred);
825 if (pages > 0) {
826 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
827 * page would be stale
828 */
6f37bb8b 829 xbzrle_cache_zero_page(rs, current_addr);
53f09a10 830 ram_release_pages(ms, block->idstr, pss->offset, pages);
6f37bb8b 831 } else if (!rs->ram_bulk_stage &&
9eb14766 832 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
5a987738 833 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
56e93d26
JQ
834 offset, last_stage, bytes_transferred);
835 if (!last_stage) {
836 /* Can't send this cached data async, since the cache page
837 * might get updated before it gets to the wire
838 */
839 send_async = false;
840 }
841 }
842 }
843
844 /* XBZRLE overflow or normal page */
845 if (pages == -1) {
846 *bytes_transferred += save_page_header(f, block,
847 offset | RAM_SAVE_FLAG_PAGE);
848 if (send_async) {
53f09a10
PB
849 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
850 migrate_release_ram() &
851 migration_in_postcopy(ms));
56e93d26
JQ
852 } else {
853 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
854 }
855 *bytes_transferred += TARGET_PAGE_SIZE;
856 pages = 1;
857 acct_info.norm_pages++;
858 }
859
860 XBZRLE_cache_unlock();
861
862 return pages;
863}
864
a7a9a88f
LL
865static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
866 ram_addr_t offset)
56e93d26
JQ
867{
868 int bytes_sent, blen;
a7a9a88f 869 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 870
a7a9a88f 871 bytes_sent = save_page_header(f, block, offset |
56e93d26 872 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 873 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 874 migrate_compress_level());
b3be2896
LL
875 if (blen < 0) {
876 bytes_sent = 0;
877 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
878 error_report("compressed data failed!");
879 } else {
880 bytes_sent += blen;
53f09a10
PB
881 ram_release_pages(migrate_get_current(), block->idstr,
882 offset & TARGET_PAGE_MASK, 1);
b3be2896 883 }
56e93d26
JQ
884
885 return bytes_sent;
886}
887
56e93d26
JQ
888static uint64_t bytes_transferred;
889
890static void flush_compressed_data(QEMUFile *f)
891{
892 int idx, len, thread_count;
893
894 if (!migrate_use_compression()) {
895 return;
896 }
897 thread_count = migrate_compress_threads();
a7a9a88f 898
0d9f9a5c 899 qemu_mutex_lock(&comp_done_lock);
56e93d26 900 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 901 while (!comp_param[idx].done) {
0d9f9a5c 902 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 903 }
a7a9a88f 904 }
0d9f9a5c 905 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
906
907 for (idx = 0; idx < thread_count; idx++) {
908 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 909 if (!comp_param[idx].quit) {
56e93d26
JQ
910 len = qemu_put_qemu_file(f, comp_param[idx].file);
911 bytes_transferred += len;
912 }
a7a9a88f 913 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
914 }
915}
916
917static inline void set_compress_params(CompressParam *param, RAMBlock *block,
918 ram_addr_t offset)
919{
920 param->block = block;
921 param->offset = offset;
922}
923
924static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
925 ram_addr_t offset,
926 uint64_t *bytes_transferred)
927{
928 int idx, thread_count, bytes_xmit = -1, pages = -1;
929
930 thread_count = migrate_compress_threads();
0d9f9a5c 931 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
932 while (true) {
933 for (idx = 0; idx < thread_count; idx++) {
934 if (comp_param[idx].done) {
a7a9a88f 935 comp_param[idx].done = false;
56e93d26 936 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 937 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 938 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
939 qemu_cond_signal(&comp_param[idx].cond);
940 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
941 pages = 1;
942 acct_info.norm_pages++;
943 *bytes_transferred += bytes_xmit;
944 break;
945 }
946 }
947 if (pages > 0) {
948 break;
949 } else {
0d9f9a5c 950 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
951 }
952 }
0d9f9a5c 953 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
954
955 return pages;
956}
957
958/**
959 * ram_save_compressed_page: compress the given page and send it to the stream
960 *
3d0684b2 961 * Returns the number of pages written.
56e93d26 962 *
6f37bb8b 963 * @rs: current RAM state
3d0684b2 964 * @ms: current migration state
56e93d26
JQ
965 * @f: QEMUFile where to send the data
966 * @block: block that contains the page we want to send
967 * @offset: offset inside the block for the page
968 * @last_stage: if we are at the completion stage
969 * @bytes_transferred: increase it with the number of transferred bytes
970 */
6f37bb8b
JQ
971static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
972 QEMUFile *f,
9eb14766 973 PageSearchStatus *pss, bool last_stage,
56e93d26
JQ
974 uint64_t *bytes_transferred)
975{
976 int pages = -1;
fc50438e 977 uint64_t bytes_xmit = 0;
56e93d26 978 uint8_t *p;
fc50438e 979 int ret, blen;
a08f6890
HZ
980 RAMBlock *block = pss->block;
981 ram_addr_t offset = pss->offset;
56e93d26 982
2f68e399 983 p = block->host + offset;
56e93d26 984
56e93d26
JQ
985 ret = ram_control_save_page(f, block->offset,
986 offset, TARGET_PAGE_SIZE, &bytes_xmit);
987 if (bytes_xmit) {
988 *bytes_transferred += bytes_xmit;
989 pages = 1;
990 }
56e93d26
JQ
991 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
992 if (ret != RAM_SAVE_CONTROL_DELAYED) {
993 if (bytes_xmit > 0) {
994 acct_info.norm_pages++;
995 } else if (bytes_xmit == 0) {
996 acct_info.dup_pages++;
997 }
998 }
999 } else {
1000 /* When starting the process of a new block, the first page of
1001 * the block should be sent out before other pages in the same
1002 * block, and all the pages in the last block should have been sent
1003 * out. Keeping this order is important, because the 'cont' flag
1004 * is used to avoid resending the block name.
1005 */
6f37bb8b 1006 if (block != rs->last_sent_block) {
56e93d26
JQ
1007 flush_compressed_data(f);
1008 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1009 if (pages == -1) {
fc50438e
LL
1010 /* Make sure the first page is sent out before other pages */
1011 bytes_xmit = save_page_header(f, block, offset |
1012 RAM_SAVE_FLAG_COMPRESS_PAGE);
1013 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1014 migrate_compress_level());
1015 if (blen > 0) {
1016 *bytes_transferred += bytes_xmit + blen;
b3be2896 1017 acct_info.norm_pages++;
b3be2896 1018 pages = 1;
fc50438e
LL
1019 } else {
1020 qemu_file_set_error(f, blen);
1021 error_report("compressed data failed!");
b3be2896 1022 }
56e93d26 1023 }
53f09a10
PB
1024 if (pages > 0) {
1025 ram_release_pages(ms, block->idstr, pss->offset, pages);
1026 }
56e93d26 1027 } else {
fc50438e 1028 offset |= RAM_SAVE_FLAG_CONTINUE;
56e93d26
JQ
1029 pages = save_zero_page(f, block, offset, p, bytes_transferred);
1030 if (pages == -1) {
1031 pages = compress_page_with_multi_thread(f, block, offset,
1032 bytes_transferred);
53f09a10
PB
1033 } else {
1034 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1035 }
1036 }
1037 }
1038
1039 return pages;
1040}
1041
3d0684b2
JQ
1042/**
1043 * find_dirty_block: find the next dirty page and update any state
1044 * associated with the search process.
b9e60928 1045 *
3d0684b2 1046 * Returns if a page is found
b9e60928 1047 *
6f37bb8b 1048 * @rs: current RAM state
3d0684b2
JQ
1049 * @f: QEMUFile where to send the data
1050 * @pss: data about the state of the current dirty page scan
1051 * @again: set to false if the search has scanned the whole of RAM
1052 * @ram_addr_abs: pointer into which to store the address of the dirty page
1053 * within the global ram_addr space
b9e60928 1054 */
6f37bb8b 1055static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1056 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1057{
6f37bb8b 1058 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1059 ram_addr_abs);
6f37bb8b
JQ
1060 if (pss->complete_round && pss->block == rs->last_seen_block &&
1061 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1062 /*
1063 * We've been once around the RAM and haven't found anything.
1064 * Give up.
1065 */
1066 *again = false;
1067 return false;
1068 }
1069 if (pss->offset >= pss->block->used_length) {
1070 /* Didn't find anything in this RAM Block */
1071 pss->offset = 0;
1072 pss->block = QLIST_NEXT_RCU(pss->block, next);
1073 if (!pss->block) {
1074 /* Hit the end of the list */
1075 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1076 /* Flag that we've looped */
1077 pss->complete_round = true;
6f37bb8b 1078 rs->ram_bulk_stage = false;
b9e60928
DDAG
1079 if (migrate_use_xbzrle()) {
1080 /* If xbzrle is on, stop using the data compression at this
1081 * point. In theory, xbzrle can do better than compression.
1082 */
1083 flush_compressed_data(f);
1084 compression_switch = false;
1085 }
1086 }
1087 /* Didn't find anything this time, but try again on the new block */
1088 *again = true;
1089 return false;
1090 } else {
1091 /* Can go around again, but... */
1092 *again = true;
1093 /* We've found something so probably don't need to */
1094 return true;
1095 }
1096}
1097
3d0684b2
JQ
1098/**
1099 * unqueue_page: gets a page of the queue
1100 *
a82d593b 1101 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1102 *
3d0684b2
JQ
1103 * Returns the block of the page (or NULL if none available)
1104 *
1105 * @ms: current migration state
1106 * @offset: used to return the offset within the RAMBlock
1107 * @ram_addr_abs: pointer into which to store the address of the dirty page
1108 * within the global ram_addr space
a82d593b
DDAG
1109 */
1110static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1111 ram_addr_t *ram_addr_abs)
1112{
1113 RAMBlock *block = NULL;
1114
1115 qemu_mutex_lock(&ms->src_page_req_mutex);
1116 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1117 struct MigrationSrcPageRequest *entry =
1118 QSIMPLEQ_FIRST(&ms->src_page_requests);
1119 block = entry->rb;
1120 *offset = entry->offset;
1121 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1122 TARGET_PAGE_MASK;
1123
1124 if (entry->len > TARGET_PAGE_SIZE) {
1125 entry->len -= TARGET_PAGE_SIZE;
1126 entry->offset += TARGET_PAGE_SIZE;
1127 } else {
1128 memory_region_unref(block->mr);
1129 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1130 g_free(entry);
1131 }
1132 }
1133 qemu_mutex_unlock(&ms->src_page_req_mutex);
1134
1135 return block;
1136}
1137
3d0684b2
JQ
1138/**
1139 * get_queued_page: unqueue a page from the postcopy requests
1140 *
1141 * Skips pages that are already sent (!dirty)
a82d593b 1142 *
3d0684b2 1143 * Returns if a queued page is found
a82d593b 1144 *
6f37bb8b 1145 * @rs: current RAM state
3d0684b2
JQ
1146 * @ms: current migration state
1147 * @pss: data about the state of the current dirty page scan
1148 * @ram_addr_abs: pointer into which to store the address of the dirty page
1149 * within the global ram_addr space
a82d593b 1150 */
6f37bb8b
JQ
1151static bool get_queued_page(RAMState *rs, MigrationState *ms,
1152 PageSearchStatus *pss,
a82d593b
DDAG
1153 ram_addr_t *ram_addr_abs)
1154{
1155 RAMBlock *block;
1156 ram_addr_t offset;
1157 bool dirty;
1158
1159 do {
1160 block = unqueue_page(ms, &offset, ram_addr_abs);
1161 /*
1162 * We're sending this page, and since it's postcopy nothing else
1163 * will dirty it, and we must make sure it doesn't get sent again
1164 * even if this queue request was received after the background
1165 * search already sent it.
1166 */
1167 if (block) {
1168 unsigned long *bitmap;
1169 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1170 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1171 if (!dirty) {
1172 trace_get_queued_page_not_dirty(
1173 block->idstr, (uint64_t)offset,
1174 (uint64_t)*ram_addr_abs,
1175 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1176 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1177 } else {
1178 trace_get_queued_page(block->idstr,
1179 (uint64_t)offset,
1180 (uint64_t)*ram_addr_abs);
1181 }
1182 }
1183
1184 } while (block && !dirty);
1185
1186 if (block) {
1187 /*
1188 * As soon as we start servicing pages out of order, then we have
1189 * to kill the bulk stage, since the bulk stage assumes
1190 * in (migration_bitmap_find_and_reset_dirty) that every page is
1191 * dirty, that's no longer true.
1192 */
6f37bb8b 1193 rs->ram_bulk_stage = false;
a82d593b
DDAG
1194
1195 /*
1196 * We want the background search to continue from the queued page
1197 * since the guest is likely to want other pages near to the page
1198 * it just requested.
1199 */
1200 pss->block = block;
1201 pss->offset = offset;
1202 }
1203
1204 return !!block;
1205}
1206
6c595cde 1207/**
5e58f968
JQ
1208 * migration_page_queue_free: drop any remaining pages in the ram
1209 * request queue
6c595cde 1210 *
3d0684b2
JQ
1211 * It should be empty at the end anyway, but in error cases there may
1212 * be some left. If any page is left, we drop it.
1213 *
1214 * @ms: current migration state
6c595cde 1215 */
5e58f968 1216void migration_page_queue_free(MigrationState *ms)
6c595cde
DDAG
1217{
1218 struct MigrationSrcPageRequest *mspr, *next_mspr;
1219 /* This queue generally should be empty - but in the case of a failed
1220 * migration might have some droppings in.
1221 */
1222 rcu_read_lock();
1223 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1224 memory_region_unref(mspr->rb->mr);
1225 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1226 g_free(mspr);
1227 }
1228 rcu_read_unlock();
1229}
1230
1231/**
3d0684b2
JQ
1232 * ram_save_queue_pages: queue the page for transmission
1233 *
1234 * A request from postcopy destination for example.
1235 *
1236 * Returns zero on success or negative on error
1237 *
1238 * @ms: current migration state
1239 * @rbname: Name of the RAMBLock of the request. NULL means the
1240 * same that last one.
1241 * @start: starting address from the start of the RAMBlock
1242 * @len: length (in bytes) to send
6c595cde
DDAG
1243 */
1244int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1245 ram_addr_t start, ram_addr_t len)
1246{
1247 RAMBlock *ramblock;
1248
d3bf5418 1249 ms->postcopy_requests++;
6c595cde
DDAG
1250 rcu_read_lock();
1251 if (!rbname) {
1252 /* Reuse last RAMBlock */
1253 ramblock = ms->last_req_rb;
1254
1255 if (!ramblock) {
1256 /*
1257 * Shouldn't happen, we can't reuse the last RAMBlock if
1258 * it's the 1st request.
1259 */
1260 error_report("ram_save_queue_pages no previous block");
1261 goto err;
1262 }
1263 } else {
1264 ramblock = qemu_ram_block_by_name(rbname);
1265
1266 if (!ramblock) {
1267 /* We shouldn't be asked for a non-existent RAMBlock */
1268 error_report("ram_save_queue_pages no block '%s'", rbname);
1269 goto err;
1270 }
1271 ms->last_req_rb = ramblock;
1272 }
1273 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1274 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1275 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1276 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1277 __func__, start, len, ramblock->used_length);
1278 goto err;
1279 }
1280
1281 struct MigrationSrcPageRequest *new_entry =
1282 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1283 new_entry->rb = ramblock;
1284 new_entry->offset = start;
1285 new_entry->len = len;
1286
1287 memory_region_ref(ramblock->mr);
1288 qemu_mutex_lock(&ms->src_page_req_mutex);
1289 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1290 qemu_mutex_unlock(&ms->src_page_req_mutex);
1291 rcu_read_unlock();
1292
1293 return 0;
1294
1295err:
1296 rcu_read_unlock();
1297 return -1;
1298}
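/*
 * Usage sketch (hypothetical values): a postcopy destination asking for
 * 64 KiB starting at offset 0x10000 of block "pc.ram" ends up queueing
 * one request entry that unqueue_page() later splits into 16 target
 * pages (assuming 4 KiB target pages), which ram_find_and_save_block()
 * services ahead of the background dirty-page scan.
 */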
1299
a82d593b 1300/**
3d0684b2 1301 * ram_save_target_page: save one target page
a82d593b 1302 *
3d0684b2 1303 * Returns the number of pages written
a82d593b 1304 *
6f37bb8b 1305 * @rs: current RAM state
3d0684b2 1306 * @ms: current migration state
a82d593b 1307 * @f: QEMUFile where to send the data
3d0684b2 1308 * @pss: data about the page we want to send
a82d593b
DDAG
1309 * @last_stage: if we are at the completion stage
1310 * @bytes_transferred: increase it with the number of transferred bytes
3d0684b2 1311 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1312 */
6f37bb8b 1313static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890 1314 PageSearchStatus *pss,
a82d593b
DDAG
1315 bool last_stage,
1316 uint64_t *bytes_transferred,
1317 ram_addr_t dirty_ram_abs)
1318{
1319 int res = 0;
1320
1321 /* Check whether the page is dirty and, if it is, send it */
1322 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1323 unsigned long *unsentmap;
1324 if (compression_switch && migrate_use_compression()) {
6f37bb8b 1325 res = ram_save_compressed_page(rs, ms, f, pss,
a82d593b
DDAG
1326 last_stage,
1327 bytes_transferred);
1328 } else {
6f37bb8b 1329 res = ram_save_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1330 bytes_transferred);
1331 }
1332
1333 if (res < 0) {
1334 return res;
1335 }
1336 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1337 if (unsentmap) {
1338 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1339 }
3fd3c4b3
DDAG
1340 /* Only update last_sent_block if a block was actually sent; xbzrle
1341 * might have decided the page was identical so didn't bother writing
1342 * to the stream.
1343 */
1344 if (res > 0) {
6f37bb8b 1345 rs->last_sent_block = pss->block;
3fd3c4b3 1346 }
a82d593b
DDAG
1347 }
1348
1349 return res;
1350}
1351
1352/**
3d0684b2 1353 * ram_save_host_page: save a whole host page
a82d593b 1354 *
3d0684b2
JQ
1355 * Starting at pss->offset, send pages up to the end of the current host
1356 * page. It's valid for the initial offset to point into the middle of
1357 * a host page in which case the remainder of the hostpage is sent.
1358 * Only dirty target pages are sent. Note that the host page size may
1359 * be a huge page for this block.
a82d593b 1360 *
3d0684b2
JQ
1361 * Returns the number of pages written or negative on error
1362 *
6f37bb8b 1363 * @rs: current RAM state
3d0684b2 1364 * @ms: current migration state
a82d593b 1365 * @f: QEMUFile where to send the data
3d0684b2 1366 * @pss: data about the page we want to send
a82d593b
DDAG
1367 * @last_stage: if we are at the completion stage
1368 * @bytes_transferred: increase it with the number of transferred bytes
1369 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1370 */
6f37bb8b 1371static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890
HZ
1372 PageSearchStatus *pss,
1373 bool last_stage,
a82d593b
DDAG
1374 uint64_t *bytes_transferred,
1375 ram_addr_t dirty_ram_abs)
1376{
1377 int tmppages, pages = 0;
4c011c37
DDAG
1378 size_t pagesize = qemu_ram_pagesize(pss->block);
1379
a82d593b 1380 do {
6f37bb8b 1381 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
a82d593b
DDAG
1382 bytes_transferred, dirty_ram_abs);
1383 if (tmppages < 0) {
1384 return tmppages;
1385 }
1386
1387 pages += tmppages;
a08f6890 1388 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1389 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1390 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1391
1392 /* The offset we leave with is the last one we looked at */
a08f6890 1393 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1394 return pages;
1395}
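/*
 * Illustration (assuming 4 KiB target pages): for a RAMBlock backed by
 * 2 MiB huge pages qemu_ram_pagesize() returns 2 MiB, so a single call
 * walks up to 512 target pages until pss->offset crosses the host-page
 * boundary; for a normal block pagesize == TARGET_PAGE_SIZE and the
 * loop body runs exactly once.
 */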
6c595cde 1396
56e93d26 1397/**
3d0684b2 1398 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1399 *
1400 * Called within an RCU critical section.
1401 *
3d0684b2 1402 * Returns the number of pages written where zero means no dirty pages
56e93d26 1403 *
6f37bb8b 1404 * @rs: current RAM state
56e93d26
JQ
1405 * @f: QEMUFile where to send the data
1406 * @last_stage: if we are at the completion stage
1407 * @bytes_transferred: increase it with the number of transferred bytes
a82d593b
DDAG
1408 *
1409 * On systems where host-page-size > target-page-size it will send all the
1410 * pages in a host page that are dirty.
56e93d26
JQ
1411 */
1412
6f37bb8b 1413static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
56e93d26
JQ
1414 uint64_t *bytes_transferred)
1415{
b8fb8cb7 1416 PageSearchStatus pss;
a82d593b 1417 MigrationState *ms = migrate_get_current();
56e93d26 1418 int pages = 0;
b9e60928 1419 bool again, found;
f3f491fc
DDAG
1420 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1421 ram_addr_t space */
56e93d26 1422
0827b9e9
AA
1423 /* No dirty page as there is zero RAM */
1424 if (!ram_bytes_total()) {
1425 return pages;
1426 }
1427
6f37bb8b
JQ
1428 pss.block = rs->last_seen_block;
1429 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1430 pss.complete_round = false;
1431
1432 if (!pss.block) {
1433 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1434 }
56e93d26 1435
b9e60928 1436 do {
a82d593b 1437 again = true;
6f37bb8b 1438 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
b9e60928 1439
a82d593b
DDAG
1440 if (!found) {
1441 /* priority queue empty, so just search for something dirty */
6f37bb8b 1442 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
a82d593b 1443 }
f3f491fc 1444
a82d593b 1445 if (found) {
6f37bb8b 1446 pages = ram_save_host_page(rs, ms, f, &pss,
a82d593b
DDAG
1447 last_stage, bytes_transferred,
1448 dirty_ram_abs);
56e93d26 1449 }
b9e60928 1450 } while (!pages && again);
56e93d26 1451
6f37bb8b
JQ
1452 rs->last_seen_block = pss.block;
1453 rs->last_offset = pss.offset;
56e93d26
JQ
1454
1455 return pages;
1456}
1457
1458void acct_update_position(QEMUFile *f, size_t size, bool zero)
1459{
1460 uint64_t pages = size / TARGET_PAGE_SIZE;
1461 if (zero) {
1462 acct_info.dup_pages += pages;
1463 } else {
1464 acct_info.norm_pages += pages;
1465 bytes_transferred += size;
1466 qemu_update_position(f, size);
1467 }
1468}
1469
1470static ram_addr_t ram_save_remaining(void)
1471{
1472 return migration_dirty_pages;
1473}
1474
1475uint64_t ram_bytes_remaining(void)
1476{
1477 return ram_save_remaining() * TARGET_PAGE_SIZE;
1478}
1479
1480uint64_t ram_bytes_transferred(void)
1481{
1482 return bytes_transferred;
1483}
1484
1485uint64_t ram_bytes_total(void)
1486{
1487 RAMBlock *block;
1488 uint64_t total = 0;
1489
1490 rcu_read_lock();
1491 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1492 total += block->used_length;
1493 rcu_read_unlock();
1494 return total;
1495}
1496
1497void free_xbzrle_decoded_buf(void)
1498{
1499 g_free(xbzrle_decoded_buf);
1500 xbzrle_decoded_buf = NULL;
1501}
1502
60be6340
DL
1503static void migration_bitmap_free(struct BitmapRcu *bmap)
1504{
1505 g_free(bmap->bmap);
f3f491fc 1506 g_free(bmap->unsentmap);
60be6340
DL
1507 g_free(bmap);
1508}
1509
6ad2a215 1510static void ram_migration_cleanup(void *opaque)
56e93d26 1511{
2ff64038
LZ
1512 /* the caller must hold the iothread lock or be in a bh, so there is
1513 * no writing race against this migration_bitmap
1514 */
60be6340
DL
1515 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1516 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1517 if (bitmap) {
56e93d26 1518 memory_global_dirty_log_stop();
60be6340 1519 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1520 }
1521
1522 XBZRLE_cache_lock();
1523 if (XBZRLE.cache) {
1524 cache_fini(XBZRLE.cache);
1525 g_free(XBZRLE.encoded_buf);
1526 g_free(XBZRLE.current_buf);
adb65dec 1527 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1528 XBZRLE.cache = NULL;
1529 XBZRLE.encoded_buf = NULL;
1530 XBZRLE.current_buf = NULL;
1531 }
1532 XBZRLE_cache_unlock();
1533}
1534
6f37bb8b 1535static void ram_state_reset(RAMState *rs)
56e93d26 1536{
6f37bb8b
JQ
1537 rs->last_seen_block = NULL;
1538 rs->last_sent_block = NULL;
1539 rs->last_offset = 0;
1540 rs->last_version = ram_list.version;
1541 rs->ram_bulk_stage = true;
56e93d26
JQ
1542}
1543
1544#define MAX_WAIT 50 /* ms, half buffered_file limit */
1545
dd631697
LZ
1546void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1547{
1548 /* called in the QEMU main thread, so there is
1549 * no writing race against this migration_bitmap
1550 */
60be6340
DL
1551 if (migration_bitmap_rcu) {
1552 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1553 bitmap = g_new(struct BitmapRcu, 1);
1554 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1555
1556 /* prevent bits in migration_bitmap from being set
1557 * by migration_bitmap_sync_range() at the same time.
1558 * It is safe for migration if migration_bitmap bits are cleared
1559 * at the same time.
1560 */
1561 qemu_mutex_lock(&migration_bitmap_mutex);
60be6340
DL
1562 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1563 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1564
1565 /* We don't have a way to safely extend the unsentmap
1566 * with RCU; so mark it as missing, and entry into postcopy
1567 * will fail.
1568 */
1569 bitmap->unsentmap = NULL;
1570
60be6340 1571 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
dd631697
LZ
1572 qemu_mutex_unlock(&migration_bitmap_mutex);
1573 migration_dirty_pages += new - old;
60be6340 1574 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1575 }
1576}
56e93d26 1577
4f2e4252
DDAG
1578/*
1579 * 'expected' is the value you expect the bitmap mostly to be full
1580 * of; it won't bother printing lines that are all this value.
1581 * If 'todump' is null the migration bitmap is dumped.
1582 */
1583void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1584{
1585 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1586
1587 int64_t cur;
1588 int64_t linelen = 128;
1589 char linebuf[129];
1590
1591 if (!todump) {
1592 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1593 }
1594
1595 for (cur = 0; cur < ram_pages; cur += linelen) {
1596 int64_t curb;
1597 bool found = false;
1598 /*
1599 * Last line; catch the case where the line length
1600 * is longer than remaining ram
1601 */
1602 if (cur + linelen > ram_pages) {
1603 linelen = ram_pages - cur;
1604 }
1605 for (curb = 0; curb < linelen; curb++) {
1606 bool thisbit = test_bit(cur + curb, todump);
1607 linebuf[curb] = thisbit ? '1' : '.';
1608 found = found || (thisbit != expected);
1609 }
1610 if (found) {
1611 linebuf[curb] = '\0';
1612 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1613 }
1614 }
1615}
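/*
 * Example output line (illustrative): with expected == false, a line
 * such as
 *
 *     0x00000080 : ..........1111111111............
 *
 * is printed only because it contains at least one '1' (a set bit);
 * lines consisting entirely of the expected value are skipped.
 */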
1616
e0b266f0
DDAG
1617/* **** functions for postcopy ***** */
1618
ced1c616
PB
1619void ram_postcopy_migrated_memory_release(MigrationState *ms)
1620{
1621 struct RAMBlock *block;
1622 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1623
1624 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1625 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1626 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1627 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1628
1629 while (run_start < range) {
1630 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1631 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1632 (run_end - run_start) << TARGET_PAGE_BITS);
1633 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1634 }
1635 }
1636}
1637
3d0684b2
JQ
1638/**
1639 * postcopy_send_discard_bm_ram: discard a RAMBlock
1640 *
1641 * Returns zero on success
1642 *
e0b266f0
DDAG
1643 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1644 * Note: At this point the 'unsentmap' is the processed bitmap combined
1645 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1646 *
1647 * @ms: current migration state
1648 * @pds: state for postcopy
1649 * @start: RAMBlock starting page
1650 * @length: RAMBlock size
e0b266f0
DDAG
1651 */
1652static int postcopy_send_discard_bm_ram(MigrationState *ms,
1653 PostcopyDiscardState *pds,
1654 unsigned long start,
1655 unsigned long length)
1656{
1657 unsigned long end = start + length; /* one after the end */
1658 unsigned long current;
1659 unsigned long *unsentmap;
1660
1661 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1662 for (current = start; current < end; ) {
1663 unsigned long one = find_next_bit(unsentmap, end, current);
1664
1665 if (one <= end) {
1666 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1667 unsigned long discard_length;
1668
1669 if (zero >= end) {
1670 discard_length = end - one;
1671 } else {
1672 discard_length = zero - one;
1673 }
d688c62d
DDAG
1674 if (discard_length) {
1675 postcopy_discard_send_range(ms, pds, one, discard_length);
1676 }
e0b266f0
DDAG
1677 current = one + discard_length;
1678 } else {
1679 current = one;
1680 }
1681 }
1682
1683 return 0;
1684}
1685
3d0684b2
JQ
1686/**
1687 * postcopy_each_ram_send_discard: discard all RAMBlocks
1688 *
1689 * Returns 0 for success or negative for error
1690 *
e0b266f0
DDAG
1691 * Utility for the outgoing postcopy code.
1692 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1693 * passing it bitmap indexes and name.
e0b266f0
DDAG
1694 * (qemu_ram_foreach_block ends up passing unscaled lengths
1695 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1696 *
1697 * @ms: current migration state
e0b266f0
DDAG
1698 */
1699static int postcopy_each_ram_send_discard(MigrationState *ms)
1700{
1701 struct RAMBlock *block;
1702 int ret;
1703
1704 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1705 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1706 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1707 first,
1708 block->idstr);
1709
1710 /*
1711 * Postcopy sends chunks of bitmap over the wire, but it
1712 * just needs indexes at this point, avoids it having
1713 * target page specific code.
1714 */
1715 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1716 block->used_length >> TARGET_PAGE_BITS);
1717 postcopy_discard_send_finish(ms, pds);
1718 if (ret) {
1719 return ret;
1720 }
1721 }
1722
1723 return 0;
1724}
1725
3d0684b2
JQ
1726/**
1727 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1728 *
1729 * Helper for postcopy_chunk_hostpages; it's called twice to
1730 * canonicalize the two bitmaps, which are similar but one is
1731 * inverted.
99e314eb 1732 *
3d0684b2
JQ
1733 * Postcopy requires that all target pages in a hostpage are dirty or
1734 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1735 *
3d0684b2
JQ
1736 * @ms: current migration state
1737 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1738 * otherwise we need to canonicalize partially dirty host pages
1739 * @block: block that contains the page we want to canonicalize
1740 * @pds: state for postcopy
99e314eb
DDAG
1741 */
1742static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1743 RAMBlock *block,
1744 PostcopyDiscardState *pds)
1745{
1746 unsigned long *bitmap;
1747 unsigned long *unsentmap;
29c59172 1748 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1749 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1750 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1751 unsigned long last = first + (len - 1);
1752 unsigned long run_start;
1753
29c59172
DDAG
1754 if (block->page_size == TARGET_PAGE_SIZE) {
1755 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1756 return;
1757 }
1758
99e314eb
DDAG
1759 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1760 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1761
1762 if (unsent_pass) {
1763 /* Find a sent page */
1764 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1765 } else {
1766 /* Find a dirty page */
1767 run_start = find_next_bit(bitmap, last + 1, first);
1768 }
1769
1770 while (run_start <= last) {
1771 bool do_fixup = false;
1772 unsigned long fixup_start_addr;
1773 unsigned long host_offset;
1774
1775 /*
1776 * If the start of this run of pages is in the middle of a host
1777 * page, then we need to fixup this host page.
1778 */
1779 host_offset = run_start % host_ratio;
1780 if (host_offset) {
1781 do_fixup = true;
1782 run_start -= host_offset;
1783 fixup_start_addr = run_start;
1784 /* For the next pass */
1785 run_start = run_start + host_ratio;
1786 } else {
1787 /* Find the end of this run */
1788 unsigned long run_end;
1789 if (unsent_pass) {
1790 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1791 } else {
1792 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1793 }
1794 /*
1795 * If the end isn't at the start of a host page, then the
1796 * run doesn't finish at the end of a host page
1797 * and we need to discard.
1798 */
1799 host_offset = run_end % host_ratio;
1800 if (host_offset) {
1801 do_fixup = true;
1802 fixup_start_addr = run_end - host_offset;
1803 /*
1804 * This host page has gone, the next loop iteration starts
1805 * from after the fixup
1806 */
1807 run_start = fixup_start_addr + host_ratio;
1808 } else {
1809 /*
1810 * No discards on this iteration, next loop starts from
1811 * next sent/dirty page
1812 */
1813 run_start = run_end + 1;
1814 }
1815 }
1816
1817 if (do_fixup) {
1818 unsigned long page;
1819
1820 /* Tell the destination to discard this page */
1821 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1822 /* For the unsent_pass we:
1823 * discard partially sent pages
1824 * For the !unsent_pass (dirty) we:
1825 * discard partially dirty pages that were sent
1826 * (any partially sent pages were already discarded
1827 * by the previous unsent_pass)
1828 */
1829 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1830 host_ratio);
1831 }
1832
1833 /* Clean up the bitmap */
1834 for (page = fixup_start_addr;
1835 page < fixup_start_addr + host_ratio; page++) {
1836 /* All pages in this host page are now not sent */
1837 set_bit(page, unsentmap);
1838
1839 /*
1840 * Remark them as dirty, updating the count for any pages
1841 * that weren't previously dirty.
1842 */
1843 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1844 }
1845 }
1846
1847 if (unsent_pass) {
1848 /* Find the next sent page for the next iteration */
1849 run_start = find_next_zero_bit(unsentmap, last + 1,
1850 run_start);
1851 } else {
1852 /* Find the next dirty page for the next iteration */
1853 run_start = find_next_bit(bitmap, last + 1, run_start);
1854 }
1855 }
1856}
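
/*
 * Illustrative, standalone sketch (not part of ram.c): the host-page
 * rounding arithmetic used by postcopy_chunk_hostpages_pass().  With a
 * hypothetical 2 MiB hugepage RAMBlock and 4 KiB target pages,
 * host_ratio is 512; any run of dirty/unsent target pages that starts
 * or ends in the middle of a host page is widened to cover the whole
 * host page.  The sizes below are assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
    const unsigned long host_page = 2 * 1024 * 1024;  /* assumed hugepage size */
    const unsigned long target_page = 4 * 1024;       /* assumed TARGET_PAGE_SIZE */
    const unsigned long host_ratio = host_page / target_page;   /* 512 */

    unsigned long run_start = 515;   /* example target-page index */
    unsigned long host_offset = run_start % host_ratio;

    if (host_offset) {
        unsigned long fixup_start = run_start - host_offset;
        printf("run starts mid host page: discard/redirty pages [%lu, %lu)\n",
               fixup_start, fixup_start + host_ratio);
    }
    return 0;
}
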
1857
3d0684b2
JQ
1858/**
1859 * postcopy_chunk_hostpages: discard any partially sent host page
1860 *
99e314eb
DDAG
1861 * Utility for the outgoing postcopy code.
1862 *
1863 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1864 * dirty host-page size chunks as all dirty. In this case the host-page
1865 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1866 *
3d0684b2
JQ
1867 * Returns zero on success
1868 *
1869 * @ms: current migration state
99e314eb
DDAG
1870 */
1871static int postcopy_chunk_hostpages(MigrationState *ms)
1872{
6f37bb8b 1873 RAMState *rs = &ram_state;
99e314eb
DDAG
1874 struct RAMBlock *block;
1875
99e314eb 1876 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1877 rs->last_seen_block = NULL;
1878 rs->last_sent_block = NULL;
1879 rs->last_offset = 0;
99e314eb
DDAG
1880
1881 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1882 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1883
1884 PostcopyDiscardState *pds =
1885 postcopy_discard_send_init(ms, first, block->idstr);
1886
1887 /* First pass: Discard all partially sent host pages */
1888 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1889 /*
1890 * Second pass: Ensure that all partially dirty host pages are made
1891 * fully dirty.
1892 */
1893 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1894
1895 postcopy_discard_send_finish(ms, pds);
1896 } /* ram_list loop */
1897
1898 return 0;
1899}
1900
3d0684b2
JQ
1901/**
1902 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1903 *
1904 * Returns zero on success
1905 *
e0b266f0
DDAG
1906 * Transmit the set of pages to be discarded after precopy to the target;
1907 * these are pages that:
1908 * a) have been previously transmitted but are now dirty again
1909 * b) have never been transmitted; this ensures that any pages on the
1910 * destination that have been mapped by background tasks get
1911 * discarded (transparent huge pages are the specific concern)
1912 * Hopefully this set is pretty sparse
3d0684b2
JQ
1913 *
1914 * @ms: current migration state
e0b266f0
DDAG
1915 */
1916int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1917{
1918 int ret;
1919 unsigned long *bitmap, *unsentmap;
1920
1921 rcu_read_lock();
1922
1923 /* This should be our last sync, the src is now paused */
8d820d6f 1924 migration_bitmap_sync(&ram_state);
e0b266f0
DDAG
1925
1926 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1927 if (!unsentmap) {
1928 /* We don't have a safe way to resize the unsentmap, so
1929 * if the bitmap was resized it will be NULL at this
1930 * point.
1931 */
1932 error_report("migration ram resized during precopy phase");
1933 rcu_read_unlock();
1934 return -EINVAL;
1935 }
1936
29c59172 1937 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1938 ret = postcopy_chunk_hostpages(ms);
1939 if (ret) {
1940 rcu_read_unlock();
1941 return ret;
1942 }
1943
e0b266f0
DDAG
1944 /*
1945 * Update the unsentmap to be unsentmap = unsentmap | dirty
1946 */
1947 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1948 bitmap_or(unsentmap, unsentmap, bitmap,
1949 last_ram_offset() >> TARGET_PAGE_BITS);
1950
1951
1952 trace_ram_postcopy_send_discard_bitmap();
1953#ifdef DEBUG_POSTCOPY
1954 ram_debug_dump_bitmap(unsentmap, true);
1955#endif
1956
1957 ret = postcopy_each_ram_send_discard(ms);
1958 rcu_read_unlock();
1959
1960 return ret;
1961}
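
/*
 * Illustrative, standalone sketch (not part of ram.c): the
 * "unsentmap = unsentmap | dirty" step above is a word-by-word OR over
 * the two bitmaps, which is what bitmap_or() does.  The fixed-size
 * arrays here are local stand-ins for the real migration bitmaps.
 */
#include <stdio.h>

#define SKETCH_WORDS 4

static void sketch_bitmap_or(unsigned long *dst, const unsigned long *a,
                             const unsigned long *b, int nwords)
{
    for (int i = 0; i < nwords; i++) {
        dst[i] = a[i] | b[i];
    }
}

int main(void)
{
    unsigned long unsent[SKETCH_WORDS] = { 0x1, 0x0, 0xf0, 0x0 };
    unsigned long dirty[SKETCH_WORDS]  = { 0x2, 0x8, 0x00, 0x1 };

    /* after this, a set bit means "never sent OR dirtied again" */
    sketch_bitmap_or(unsent, unsent, dirty, SKETCH_WORDS);

    for (int i = 0; i < SKETCH_WORDS; i++) {
        printf("word %d: %#lx\n", i, unsent[i]);
    }
    return 0;
}
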
1962
3d0684b2
JQ
1963/**
1964 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1965 *
3d0684b2 1966 * Returns zero on success
e0b266f0 1967 *
3d0684b2 1968 * @mis: current migration incoming state
36449157
JQ
1969 * @rbname: name of the RAMBlock of the request. NULL means the
1970 * same as the last one.
3d0684b2
JQ
1971 * @start: RAMBlock starting page
1972 * @length: RAMBlock size
e0b266f0
DDAG
1973 */
1974int ram_discard_range(MigrationIncomingState *mis,
36449157 1975 const char *rbname,
e0b266f0
DDAG
1976 uint64_t start, size_t length)
1977{
1978 int ret = -1;
1979
36449157 1980 trace_ram_discard_range(rbname, start, length);
d3a5038c 1981
e0b266f0 1982 rcu_read_lock();
36449157 1983 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1984
1985 if (!rb) {
36449157 1986 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1987 goto err;
1988 }
1989
d3a5038c 1990 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1991
1992err:
1993 rcu_read_unlock();
1994
1995 return ret;
1996}
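
/*
 * Illustrative, standalone, Linux-only sketch (not part of ram.c): the
 * effect ram_block_discard_range() aims for is dropping the pages that
 * back a range; for anonymous memory that is typically
 * madvise(MADV_DONTNEED), after which the next access reads back as
 * zero-filled pages.  This is a minimal demo, not QEMU's implementation.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096 * 4;
    unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    memset(p, 0xab, len);                 /* dirty the pages */

    if (madvise(p, len, MADV_DONTNEED)) { /* "discard" the range */
        perror("madvise");
        return 1;
    }

    printf("first byte after discard: %#x\n", p[0]); /* prints 0 */
    munmap(p, len);
    return 0;
}
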
1997
6f37bb8b 1998static int ram_save_init_globals(RAMState *rs)
56e93d26 1999{
56e93d26
JQ
2000 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2001
8d820d6f 2002 rs->dirty_rate_high_cnt = 0;
5a987738 2003 rs->bitmap_sync_count = 0;
56e93d26 2004 migration_bitmap_sync_init();
dd631697 2005 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
2006
2007 if (migrate_use_xbzrle()) {
2008 XBZRLE_cache_lock();
adb65dec 2009 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2010 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2011 TARGET_PAGE_SIZE,
2012 TARGET_PAGE_SIZE);
2013 if (!XBZRLE.cache) {
2014 XBZRLE_cache_unlock();
2015 error_report("Error creating cache");
2016 return -1;
2017 }
2018 XBZRLE_cache_unlock();
2019
2020 /* We prefer not to abort if there is no memory */
2021 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2022 if (!XBZRLE.encoded_buf) {
2023 error_report("Error allocating encoded_buf");
2024 return -1;
2025 }
2026
2027 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2028 if (!XBZRLE.current_buf) {
2029 error_report("Error allocating current_buf");
2030 g_free(XBZRLE.encoded_buf);
2031 XBZRLE.encoded_buf = NULL;
2032 return -1;
2033 }
2034
2035 acct_clear();
2036 }
2037
49877834
PB
2038 /* For memory_global_dirty_log_start below. */
2039 qemu_mutex_lock_iothread();
2040
56e93d26
JQ
2041 qemu_mutex_lock_ramlist();
2042 rcu_read_lock();
2043 bytes_transferred = 0;
6f37bb8b 2044 ram_state_reset(rs);
56e93d26 2045
f3f491fc 2046 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
0827b9e9
AA
2047 /* Skip setting bitmap if there is no RAM */
2048 if (ram_bytes_total()) {
2049 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2050 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2051 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2052
2053 if (migrate_postcopy_ram()) {
2054 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2055 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2056 }
f3f491fc
DDAG
2057 }
2058
56e93d26
JQ
2059 /*
2060 * Count the total number of pages used by ram blocks not including any
2061 * gaps due to alignment or unplugs.
2062 */
2063 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2064
2065 memory_global_dirty_log_start();
8d820d6f 2066 migration_bitmap_sync(rs);
56e93d26 2067 qemu_mutex_unlock_ramlist();
49877834 2068 qemu_mutex_unlock_iothread();
a91246c9
HZ
2069 rcu_read_unlock();
2070
2071 return 0;
2072}
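
/*
 * Illustrative, standalone sketch (not part of ram.c): the dirty bitmap
 * allocated above holds one bit per target page.  For a hypothetical
 * guest with 4 GiB of RAM and 4 KiB target pages that is 1M pages,
 * i.e. 128 KiB per bitmap (doubled when postcopy also needs the
 * unsentmap).  The sizes are assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
    const unsigned long long ram_bytes = 4ULL << 30;   /* assumed guest RAM */
    const unsigned target_page_bits = 12;              /* assumed 4 KiB pages */

    unsigned long long pages = ram_bytes >> target_page_bits;
    unsigned long long bitmap_bytes = pages / 8;

    printf("%llu pages -> %llu KiB per bitmap\n", pages, bitmap_bytes >> 10);
    return 0;
}
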
2073
3d0684b2
JQ
2074/*
2075 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2076 * a long-running RCU critical section. When RCU reclaims in the code
2077 * start to become numerous, it will be necessary to reduce the
2078 * granularity of these critical sections.
2079 */
2080
3d0684b2
JQ
2081/**
2082 * ram_save_setup: Setup RAM for migration
2083 *
2084 * Returns zero to indicate success and negative for error
2085 *
2086 * @f: QEMUFile where to send the data
2087 * @opaque: RAMState pointer
2088 */
a91246c9
HZ
2089static int ram_save_setup(QEMUFile *f, void *opaque)
2090{
6f37bb8b 2091 RAMState *rs = opaque;
a91246c9
HZ
2092 RAMBlock *block;
2093
2094 /* migration has already setup the bitmap, reuse it. */
2095 if (!migration_in_colo_state()) {
6f37bb8b 2096 if (ram_save_init_globals(rs) < 0) {
a91246c9
HZ
2097 return -1;
2098 }
2099 }
2100
2101 rcu_read_lock();
56e93d26
JQ
2102
2103 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2104
2105 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2106 qemu_put_byte(f, strlen(block->idstr));
2107 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2108 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2109 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2110 qemu_put_be64(f, block->page_size);
2111 }
56e93d26
JQ
2112 }
2113
2114 rcu_read_unlock();
2115
2116 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2117 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2118
2119 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2120
2121 return 0;
2122}
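
/*
 * Illustrative, standalone sketch (not part of ram.c): the shape of the
 * per-RAMBlock record that ram_save_setup() emits -- a one-byte id
 * length, the id string, and a big-endian 64-bit used_length (plus an
 * optional page size for postcopy).  put_be64() and the flat buffer are
 * local stand-ins for QEMUFile.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t put_be64(uint8_t *buf, uint64_t v)
{
    for (int i = 0; i < 8; i++) {
        buf[i] = (uint8_t)(v >> (56 - 8 * i));
    }
    return 8;
}

static size_t put_block_record(uint8_t *buf, const char *idstr,
                               uint64_t used_length)
{
    size_t off = 0;
    uint8_t len = (uint8_t)strlen(idstr);

    buf[off++] = len;                        /* id length */
    memcpy(buf + off, idstr, len);           /* id string, not NUL-terminated */
    off += len;
    off += put_be64(buf + off, used_length); /* block size in bytes */
    return off;
}

int main(void)
{
    uint8_t buf[64];
    size_t n = put_block_record(buf, "pc.ram", 1ULL << 30);

    printf("record for 'pc.ram' is %zu bytes\n", n);
    return 0;
}
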
2123
3d0684b2
JQ
2124/**
2125 * ram_save_iterate: iterative stage for migration
2126 *
2127 * Returns zero to indicate success and negative for error
2128 *
2129 * @f: QEMUFile where to send the data
2130 * @opaque: RAMState pointer
2131 */
56e93d26
JQ
2132static int ram_save_iterate(QEMUFile *f, void *opaque)
2133{
6f37bb8b 2134 RAMState *rs = opaque;
56e93d26
JQ
2135 int ret;
2136 int i;
2137 int64_t t0;
5c90308f 2138 int done = 0;
56e93d26
JQ
2139
2140 rcu_read_lock();
6f37bb8b
JQ
2141 if (ram_list.version != rs->last_version) {
2142 ram_state_reset(rs);
56e93d26
JQ
2143 }
2144
2145 /* Read version before ram_list.blocks */
2146 smp_rmb();
2147
2148 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2149
2150 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2151 i = 0;
2152 while ((ret = qemu_file_rate_limit(f)) == 0) {
2153 int pages;
2154
6f37bb8b 2155 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
56e93d26
JQ
2156 /* no more pages to send */
2157 if (pages == 0) {
5c90308f 2158 done = 1;
56e93d26
JQ
2159 break;
2160 }
56e93d26 2161 acct_info.iterations++;
070afca2 2162
56e93d26
JQ
2163 /* we want to check in the 1st loop, just in case it was the 1st time
2164 and we had to sync the dirty bitmap.
2165 qemu_clock_get_ns() is a bit expensive, so we only check once every
2166 few iterations
2167 */
2168 if ((i & 63) == 0) {
2169 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2170 if (t1 > MAX_WAIT) {
55c4446b 2171 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2172 break;
2173 }
2174 }
2175 i++;
2176 }
2177 flush_compressed_data(f);
2178 rcu_read_unlock();
2179
2180 /*
2181 * Must occur before EOS (or any QEMUFile operation)
2182 * because of RDMA protocol.
2183 */
2184 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2185
2186 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2187 bytes_transferred += 8;
2188
2189 ret = qemu_file_get_error(f);
2190 if (ret < 0) {
2191 return ret;
2192 }
2193
5c90308f 2194 return done;
56e93d26
JQ
2195}
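
/*
 * Illustrative, standalone sketch (not part of ram.c): the pattern
 * ram_save_iterate() uses to bound its time in the send loop -- the
 * clock is only sampled every 64 iterations because reading it each
 * time would be comparatively expensive.  MAX_WAIT_MS and do_work()
 * are local stand-ins, not QEMU symbols.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MAX_WAIT_MS 50

static int64_t now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

static int do_work(void)
{
    /* stand-in for ram_find_and_save_block(); pretend there is more work */
    return 1;
}

int main(void)
{
    int64_t t0 = now_ms();
    long i = 0;

    while (do_work()) {
        if ((i & 63) == 0 && now_ms() - t0 > MAX_WAIT_MS) {
            printf("breaking after %ld iterations\n", i);
            break;
        }
        i++;
    }
    return 0;
}
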
2196
3d0684b2
JQ
2197/**
2198 * ram_save_complete: function called to send the remaining amount of ram
2199 *
2200 * Returns zero to indicate success
2201 *
2202 * Called with iothread lock
2203 *
2204 * @f: QEMUFile where to send the data
2205 * @opaque: RAMState pointer
2206 */
56e93d26
JQ
2207static int ram_save_complete(QEMUFile *f, void *opaque)
2208{
6f37bb8b
JQ
2209 RAMState *rs = opaque;
2210
56e93d26
JQ
2211 rcu_read_lock();
2212
663e6c1d 2213 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2214 migration_bitmap_sync(rs);
663e6c1d 2215 }
56e93d26
JQ
2216
2217 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2218
2219 /* try transferring iterative blocks of memory */
2220
2221 /* flush all remaining blocks regardless of rate limiting */
2222 while (true) {
2223 int pages;
2224
6f37bb8b 2225 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
a91246c9 2226 &bytes_transferred);
56e93d26
JQ
2227 /* no more blocks to send */
2228 if (pages == 0) {
2229 break;
2230 }
2231 }
2232
2233 flush_compressed_data(f);
2234 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2235
2236 rcu_read_unlock();
d09a6fde 2237
56e93d26
JQ
2238 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2239
2240 return 0;
2241}
2242
c31b098f
DDAG
2243static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2244 uint64_t *non_postcopiable_pending,
2245 uint64_t *postcopiable_pending)
56e93d26 2246{
8d820d6f 2247 RAMState *rs = opaque;
56e93d26
JQ
2248 uint64_t remaining_size;
2249
2250 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2251
663e6c1d
DDAG
2252 if (!migration_in_postcopy(migrate_get_current()) &&
2253 remaining_size < max_size) {
56e93d26
JQ
2254 qemu_mutex_lock_iothread();
2255 rcu_read_lock();
8d820d6f 2256 migration_bitmap_sync(rs);
56e93d26
JQ
2257 rcu_read_unlock();
2258 qemu_mutex_unlock_iothread();
2259 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2260 }
c31b098f
DDAG
2261
2262 /* We can do postcopy, and all the data is postcopiable */
2263 *postcopiable_pending += remaining_size;
56e93d26
JQ
2264}
2265
2266static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2267{
2268 unsigned int xh_len;
2269 int xh_flags;
063e760a 2270 uint8_t *loaded_data;
56e93d26
JQ
2271
2272 if (!xbzrle_decoded_buf) {
2273 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2274 }
063e760a 2275 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2276
2277 /* extract RLE header */
2278 xh_flags = qemu_get_byte(f);
2279 xh_len = qemu_get_be16(f);
2280
2281 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2282 error_report("Failed to load XBZRLE page - wrong compression!");
2283 return -1;
2284 }
2285
2286 if (xh_len > TARGET_PAGE_SIZE) {
2287 error_report("Failed to load XBZRLE page - len overflow!");
2288 return -1;
2289 }
2290 /* load data and decode */
063e760a 2291 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2292
2293 /* decode RLE */
063e760a 2294 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2295 TARGET_PAGE_SIZE) == -1) {
2296 error_report("Failed to load XBZRLE page - decode error!");
2297 return -1;
2298 }
2299
2300 return 0;
2301}
2302
3d0684b2
JQ
2303/**
2304 * ram_block_from_stream: read a RAMBlock id from the migration stream
2305 *
2306 * Must be called from within a rcu critical section.
2307 *
56e93d26 2308 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2309 *
3d0684b2
JQ
2310 * @f: QEMUFile where to read the data from
2311 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2312 */
3d0684b2 2313static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2314{
2315 static RAMBlock *block = NULL;
2316 char id[256];
2317 uint8_t len;
2318
2319 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2320 if (!block) {
56e93d26
JQ
2321 error_report("Ack, bad migration stream!");
2322 return NULL;
2323 }
4c4bad48 2324 return block;
56e93d26
JQ
2325 }
2326
2327 len = qemu_get_byte(f);
2328 qemu_get_buffer(f, (uint8_t *)id, len);
2329 id[len] = 0;
2330
e3dd7493 2331 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2332 if (!block) {
2333 error_report("Can't find block %s", id);
2334 return NULL;
56e93d26
JQ
2335 }
2336
4c4bad48
HZ
2337 return block;
2338}
2339
2340static inline void *host_from_ram_block_offset(RAMBlock *block,
2341 ram_addr_t offset)
2342{
2343 if (!offset_in_ramblock(block, offset)) {
2344 return NULL;
2345 }
2346
2347 return block->host + offset;
56e93d26
JQ
2348}
2349
3d0684b2
JQ
2350/**
2351 * ram_handle_compressed: handle the zero page case
2352 *
56e93d26
JQ
2353 * If a page (or a whole RDMA chunk) has been
2354 * determined to be zero, then zap it.
3d0684b2
JQ
2355 *
2356 * @host: host address for the zero page
2357 * @ch: what the page is filled with. We only support zero
2358 * @size: size of the zero page
56e93d26
JQ
2359 */
2360void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2361{
2362 if (ch != 0 || !is_zero_range(host, size)) {
2363 memset(host, ch, size);
2364 }
2365}
2366
2367static void *do_data_decompress(void *opaque)
2368{
2369 DecompressParam *param = opaque;
2370 unsigned long pagesize;
33d151f4
LL
2371 uint8_t *des;
2372 int len;
56e93d26 2373
33d151f4 2374 qemu_mutex_lock(&param->mutex);
90e56fb4 2375 while (!param->quit) {
33d151f4
LL
2376 if (param->des) {
2377 des = param->des;
2378 len = param->len;
2379 param->des = 0;
2380 qemu_mutex_unlock(&param->mutex);
2381
56e93d26 2382 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2383 /* uncompress() can fail in some cases, especially when the
2384 * page was dirtied while it was being compressed. That's not
2385 * a problem because the dirty page will be retransferred and
2386 * uncompress() won't corrupt the data in other pages.
2387 */
33d151f4
LL
2388 uncompress((Bytef *)des, &pagesize,
2389 (const Bytef *)param->compbuf, len);
73a8912b 2390
33d151f4
LL
2391 qemu_mutex_lock(&decomp_done_lock);
2392 param->done = true;
2393 qemu_cond_signal(&decomp_done_cond);
2394 qemu_mutex_unlock(&decomp_done_lock);
2395
2396 qemu_mutex_lock(&param->mutex);
2397 } else {
2398 qemu_cond_wait(&param->cond, &param->mutex);
2399 }
56e93d26 2400 }
33d151f4 2401 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2402
2403 return NULL;
2404}
2405
5533b2e9
LL
2406static void wait_for_decompress_done(void)
2407{
2408 int idx, thread_count;
2409
2410 if (!migrate_use_compression()) {
2411 return;
2412 }
2413
2414 thread_count = migrate_decompress_threads();
2415 qemu_mutex_lock(&decomp_done_lock);
2416 for (idx = 0; idx < thread_count; idx++) {
2417 while (!decomp_param[idx].done) {
2418 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2419 }
2420 }
2421 qemu_mutex_unlock(&decomp_done_lock);
2422}
2423
56e93d26
JQ
2424void migrate_decompress_threads_create(void)
2425{
2426 int i, thread_count;
2427
2428 thread_count = migrate_decompress_threads();
2429 decompress_threads = g_new0(QemuThread, thread_count);
2430 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2431 qemu_mutex_init(&decomp_done_lock);
2432 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2433 for (i = 0; i < thread_count; i++) {
2434 qemu_mutex_init(&decomp_param[i].mutex);
2435 qemu_cond_init(&decomp_param[i].cond);
2436 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2437 decomp_param[i].done = true;
90e56fb4 2438 decomp_param[i].quit = false;
56e93d26
JQ
2439 qemu_thread_create(decompress_threads + i, "decompress",
2440 do_data_decompress, decomp_param + i,
2441 QEMU_THREAD_JOINABLE);
2442 }
2443}
2444
2445void migrate_decompress_threads_join(void)
2446{
2447 int i, thread_count;
2448
56e93d26
JQ
2449 thread_count = migrate_decompress_threads();
2450 for (i = 0; i < thread_count; i++) {
2451 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2452 decomp_param[i].quit = true;
56e93d26
JQ
2453 qemu_cond_signal(&decomp_param[i].cond);
2454 qemu_mutex_unlock(&decomp_param[i].mutex);
2455 }
2456 for (i = 0; i < thread_count; i++) {
2457 qemu_thread_join(decompress_threads + i);
2458 qemu_mutex_destroy(&decomp_param[i].mutex);
2459 qemu_cond_destroy(&decomp_param[i].cond);
2460 g_free(decomp_param[i].compbuf);
2461 }
2462 g_free(decompress_threads);
2463 g_free(decomp_param);
56e93d26
JQ
2464 decompress_threads = NULL;
2465 decomp_param = NULL;
56e93d26
JQ
2466}
2467
c1bc6626 2468static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2469 void *host, int len)
2470{
2471 int idx, thread_count;
2472
2473 thread_count = migrate_decompress_threads();
73a8912b 2474 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2475 while (true) {
2476 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2477 if (decomp_param[idx].done) {
33d151f4
LL
2478 decomp_param[idx].done = false;
2479 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2480 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2481 decomp_param[idx].des = host;
2482 decomp_param[idx].len = len;
33d151f4
LL
2483 qemu_cond_signal(&decomp_param[idx].cond);
2484 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2485 break;
2486 }
2487 }
2488 if (idx < thread_count) {
2489 break;
73a8912b
LL
2490 } else {
2491 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2492 }
2493 }
73a8912b 2494 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2495}
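
/*
 * Illustrative, standalone pthreads sketch (not part of ram.c): the
 * dispatch pattern of decompress_data_with_multi_threads() -- each worker
 * owns a mutex/condvar plus a 'done' flag, and the feeder hands a job to
 * the first idle worker or waits on a shared condvar until one finishes.
 * The "work" is just a sleep; all names here are local stand-ins.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NWORKERS 2

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    bool done;              /* protected by done_lock */
    bool quit;
    int job;                /* 0 means "nothing to do" */
} Worker;

static Worker workers[NWORKERS];
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;

static void *worker_fn(void *opaque)
{
    Worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->job) {
            w->job = 0;
            pthread_mutex_unlock(&w->mutex);
            usleep(2000);               /* stand-in for uncompress() */
            pthread_mutex_lock(&done_lock);
            w->done = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);
            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}

static void dispatch(int job)
{
    pthread_mutex_lock(&done_lock);
    for (;;) {
        for (int i = 0; i < NWORKERS; i++) {
            if (workers[i].done) {
                workers[i].done = false;
                pthread_mutex_lock(&workers[i].mutex);
                workers[i].job = job;
                pthread_cond_signal(&workers[i].cond);
                pthread_mutex_unlock(&workers[i].mutex);
                pthread_mutex_unlock(&done_lock);
                return;
            }
        }
        pthread_cond_wait(&done_cond, &done_lock);  /* everyone is busy */
    }
}

int main(void)
{
    pthread_t tids[NWORKERS];

    for (int i = 0; i < NWORKERS; i++) {
        pthread_mutex_init(&workers[i].mutex, NULL);
        pthread_cond_init(&workers[i].cond, NULL);
        workers[i].done = true;
        pthread_create(&tids[i], NULL, worker_fn, &workers[i]);
    }
    for (int job = 1; job <= 4; job++) {
        dispatch(job);
        printf("dispatched job %d\n", job);
    }
    for (int i = 0; i < NWORKERS; i++) {    /* ask workers to quit, then reap */
        pthread_mutex_lock(&workers[i].mutex);
        workers[i].quit = true;
        pthread_cond_signal(&workers[i].cond);
        pthread_mutex_unlock(&workers[i].mutex);
        pthread_join(tids[i], NULL);
    }
    return 0;
}
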
2496
3d0684b2
JQ
2497/**
2498 * ram_postcopy_incoming_init: allocate postcopy data structures
2499 *
2500 * Returns 0 for success and negative if there was one error
2501 *
2502 * @mis: current migration incoming state
2503 *
2504 * Allocate data structures etc needed by incoming migration with
2505 * postcopy-ram. postcopy-ram's similarly named
2506 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2507 */
2508int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2509{
2510 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2511
2512 return postcopy_ram_incoming_init(mis, ram_pages);
2513}
2514
3d0684b2
JQ
2515/**
2516 * ram_load_postcopy: load a page in postcopy case
2517 *
2518 * Returns 0 for success or -errno in case of error
2519 *
a7180877
DDAG
2520 * Called in postcopy mode by ram_load().
2521 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2522 *
2523 * @f: QEMUFile where to send the data
a7180877
DDAG
2524 */
2525static int ram_load_postcopy(QEMUFile *f)
2526{
2527 int flags = 0, ret = 0;
2528 bool place_needed = false;
28abd200 2529 bool matching_page_sizes = false;
a7180877
DDAG
2530 MigrationIncomingState *mis = migration_incoming_get_current();
2531 /* Temporary page that is later 'placed' */
2532 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2533 void *last_host = NULL;
a3b6ff6d 2534 bool all_zero = false;
a7180877
DDAG
2535
2536 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2537 ram_addr_t addr;
2538 void *host = NULL;
2539 void *page_buffer = NULL;
2540 void *place_source = NULL;
df9ff5e1 2541 RAMBlock *block = NULL;
a7180877 2542 uint8_t ch;
a7180877
DDAG
2543
2544 addr = qemu_get_be64(f);
2545 flags = addr & ~TARGET_PAGE_MASK;
2546 addr &= TARGET_PAGE_MASK;
2547
2548 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2549 place_needed = false;
2550 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2551 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2552
2553 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2554 if (!host) {
2555 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2556 ret = -EINVAL;
2557 break;
2558 }
28abd200 2559 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2560 /*
28abd200
DDAG
2561 * Postcopy requires that we place whole host pages atomically;
2562 * these may be huge pages for RAMBlocks that are backed by
2563 * hugetlbfs.
a7180877
DDAG
2564 * To make it atomic, the data is read into a temporary page
2565 * that's moved into place later.
2566 * The migration protocol uses, possibly smaller, target-pages
2567 * however the source ensures it always sends all the components
2568 * of a host page in order.
2569 */
2570 page_buffer = postcopy_host_page +
28abd200 2571 ((uintptr_t)host & (block->page_size - 1));
a7180877 2572 /* If all TP are zero then we can optimise the place */
28abd200 2573 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2574 all_zero = true;
c53b7ddc
DDAG
2575 } else {
2576 /* not the 1st TP within the HP */
2577 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2578 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2579 host, last_host);
2580 ret = -EINVAL;
2581 break;
2582 }
a7180877
DDAG
2583 }
2584
c53b7ddc 2585
a7180877
DDAG
2586 /*
2587 * If it's the last part of a host page then we place the host
2588 * page
2589 */
2590 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2591 (block->page_size - 1)) == 0;
a7180877
DDAG
2592 place_source = postcopy_host_page;
2593 }
c53b7ddc 2594 last_host = host;
a7180877
DDAG
2595
2596 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2597 case RAM_SAVE_FLAG_COMPRESS:
2598 ch = qemu_get_byte(f);
2599 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2600 if (ch) {
2601 all_zero = false;
2602 }
2603 break;
2604
2605 case RAM_SAVE_FLAG_PAGE:
2606 all_zero = false;
2607 if (!place_needed || !matching_page_sizes) {
2608 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2609 } else {
2610 /* Avoids the qemu_file copy during postcopy; postcopy is
2611 * going to do its own copy later anyway. We can only do this
2612 * when the read is done in one go (matching page sizes)
2613 */
2614 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2615 TARGET_PAGE_SIZE);
2616 }
2617 break;
2618 case RAM_SAVE_FLAG_EOS:
2619 /* normal exit */
2620 break;
2621 default:
2622 error_report("Unknown combination of migration flags: %#x"
2623 " (postcopy mode)", flags);
2624 ret = -EINVAL;
2625 }
2626
2627 if (place_needed) {
2628 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2629 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2630
a7180877 2631 if (all_zero) {
df9ff5e1
DDAG
2632 ret = postcopy_place_page_zero(mis, place_dest,
2633 block->page_size);
a7180877 2634 } else {
df9ff5e1
DDAG
2635 ret = postcopy_place_page(mis, place_dest,
2636 place_source, block->page_size);
a7180877
DDAG
2637 }
2638 }
2639 if (!ret) {
2640 ret = qemu_file_get_error(f);
2641 }
2642 }
2643
2644 return ret;
2645}
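
/*
 * Illustrative, standalone sketch (not part of ram.c): the arithmetic
 * ram_load_postcopy() uses to assemble a huge host page from its
 * target-page pieces.  With an assumed 2 MiB host page and 4 KiB target
 * pages, each incoming target page lands at its offset inside the
 * temporary host page, and the whole host page is "placed" only when
 * its last target page has arrived.  Addresses and sizes are made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uintptr_t host_page_size = 2 * 1024 * 1024;  /* assumed hugepage */
    const uintptr_t target_page_size = 4 * 1024;       /* assumed TPS */

    /* pretend these are the host addresses of incoming target pages */
    uintptr_t hosts[] = { 0x40000000, 0x40001000, 0x401ff000 };

    for (unsigned i = 0; i < sizeof(hosts) / sizeof(hosts[0]); i++) {
        uintptr_t host = hosts[i];
        uintptr_t offset_in_hp = host & (host_page_size - 1);
        bool place_needed =
            ((host + target_page_size) & (host_page_size - 1)) == 0;

        printf("target page at %#lx -> buffer offset %#lx, place=%d\n",
               (unsigned long)host, (unsigned long)offset_in_hp,
               place_needed);
    }
    return 0;
}
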
2646
56e93d26
JQ
2647static int ram_load(QEMUFile *f, void *opaque, int version_id)
2648{
2649 int flags = 0, ret = 0;
2650 static uint64_t seq_iter;
2651 int len = 0;
a7180877
DDAG
2652 /*
2653 * If system is running in postcopy mode, page inserts to host memory must
2654 * be atomic
2655 */
2656 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2657 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2658 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2659
2660 seq_iter++;
2661
2662 if (version_id != 4) {
2663 ret = -EINVAL;
2664 }
2665
2666 /* This RCU critical section can be very long running.
2667 * When RCU reclaims in the code start to become numerous,
2668 * it will be necessary to reduce the granularity of this
2669 * critical section.
2670 */
2671 rcu_read_lock();
a7180877
DDAG
2672
2673 if (postcopy_running) {
2674 ret = ram_load_postcopy(f);
2675 }
2676
2677 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2678 ram_addr_t addr, total_ram_bytes;
a776aa15 2679 void *host = NULL;
56e93d26
JQ
2680 uint8_t ch;
2681
2682 addr = qemu_get_be64(f);
2683 flags = addr & ~TARGET_PAGE_MASK;
2684 addr &= TARGET_PAGE_MASK;
2685
a776aa15
DDAG
2686 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2687 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2688 RAMBlock *block = ram_block_from_stream(f, flags);
2689
2690 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2691 if (!host) {
2692 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2693 ret = -EINVAL;
2694 break;
2695 }
2696 }
2697
56e93d26
JQ
2698 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2699 case RAM_SAVE_FLAG_MEM_SIZE:
2700 /* Synchronize RAM block list */
2701 total_ram_bytes = addr;
2702 while (!ret && total_ram_bytes) {
2703 RAMBlock *block;
56e93d26
JQ
2704 char id[256];
2705 ram_addr_t length;
2706
2707 len = qemu_get_byte(f);
2708 qemu_get_buffer(f, (uint8_t *)id, len);
2709 id[len] = 0;
2710 length = qemu_get_be64(f);
2711
e3dd7493
DDAG
2712 block = qemu_ram_block_by_name(id);
2713 if (block) {
2714 if (length != block->used_length) {
2715 Error *local_err = NULL;
56e93d26 2716
fa53a0e5 2717 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2718 &local_err);
2719 if (local_err) {
2720 error_report_err(local_err);
56e93d26 2721 }
56e93d26 2722 }
ef08fb38
DDAG
2723 /* For postcopy we need to check hugepage sizes match */
2724 if (postcopy_advised &&
2725 block->page_size != qemu_host_page_size) {
2726 uint64_t remote_page_size = qemu_get_be64(f);
2727 if (remote_page_size != block->page_size) {
2728 error_report("Mismatched RAM page size %s "
2729 "(local) %zd != %" PRId64,
2730 id, block->page_size,
2731 remote_page_size);
2732 ret = -EINVAL;
2733 }
2734 }
e3dd7493
DDAG
2735 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2736 block->idstr);
2737 } else {
56e93d26
JQ
2738 error_report("Unknown ramblock \"%s\", cannot "
2739 "accept migration", id);
2740 ret = -EINVAL;
2741 }
2742
2743 total_ram_bytes -= length;
2744 }
2745 break;
a776aa15 2746
56e93d26 2747 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2748 ch = qemu_get_byte(f);
2749 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2750 break;
a776aa15 2751
56e93d26 2752 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2753 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2754 break;
56e93d26 2755
a776aa15 2756 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2757 len = qemu_get_be32(f);
2758 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2759 error_report("Invalid compressed data length: %d", len);
2760 ret = -EINVAL;
2761 break;
2762 }
c1bc6626 2763 decompress_data_with_multi_threads(f, host, len);
56e93d26 2764 break;
a776aa15 2765
56e93d26 2766 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2767 if (load_xbzrle(f, addr, host) < 0) {
2768 error_report("Failed to decompress XBZRLE page at "
2769 RAM_ADDR_FMT, addr);
2770 ret = -EINVAL;
2771 break;
2772 }
2773 break;
2774 case RAM_SAVE_FLAG_EOS:
2775 /* normal exit */
2776 break;
2777 default:
2778 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2779 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2780 } else {
2781 error_report("Unknown combination of migration flags: %#x",
2782 flags);
2783 ret = -EINVAL;
2784 }
2785 }
2786 if (!ret) {
2787 ret = qemu_file_get_error(f);
2788 }
2789 }
2790
5533b2e9 2791 wait_for_decompress_done();
56e93d26 2792 rcu_read_unlock();
55c4446b 2793 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2794 return ret;
2795}
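
/*
 * Illustrative, standalone sketch (not part of ram.c): how ram_load()
 * splits the 64-bit value at the head of each record into a page-aligned
 * address and the flag bits packed into its low bits; this only works
 * because addresses are always target-page aligned.  The constants below
 * are assumptions for a 4 KiB target page, with flag values mirroring
 * RAM_SAVE_FLAG_PAGE and RAM_SAVE_FLAG_CONTINUE.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_MASK (~((uint64_t)4096 - 1))
#define SKETCH_FLAG_PAGE 0x08
#define SKETCH_FLAG_CONTINUE 0x20

int main(void)
{
    /* sender side: OR the flags into the low bits of the aligned address */
    uint64_t wire = 0x7f4000 | SKETCH_FLAG_PAGE | SKETCH_FLAG_CONTINUE;

    /* receiver side: mask them back apart */
    uint64_t flags = wire & ~SKETCH_PAGE_MASK;
    uint64_t addr = wire & SKETCH_PAGE_MASK;

    printf("addr=%#llx flags=%#llx\n",
           (unsigned long long)addr, (unsigned long long)flags);
    return 0;
}
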
2796
2797static SaveVMHandlers savevm_ram_handlers = {
2798 .save_live_setup = ram_save_setup,
2799 .save_live_iterate = ram_save_iterate,
763c906b 2800 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2801 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2802 .save_live_pending = ram_save_pending,
2803 .load_state = ram_load,
6ad2a215 2804 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2805};
2806
2807void ram_mig_init(void)
2808{
2809 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2810 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2811}