1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
45#include "qemu/rcu_queue.h"
46
47#ifdef DEBUG_MIGRATION_RAM
48#define DPRINTF(fmt, ...) \
49 do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
50#else
51#define DPRINTF(fmt, ...) \
52 do { } while (0)
53#endif
54
56e93d26 55static int dirty_rate_high_cnt;
56
57static uint64_t bitmap_sync_count;
58
59/***********************************************************/
60/* ram save/restore */
61
62#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
63#define RAM_SAVE_FLAG_COMPRESS 0x02
64#define RAM_SAVE_FLAG_MEM_SIZE 0x04
65#define RAM_SAVE_FLAG_PAGE 0x08
66#define RAM_SAVE_FLAG_EOS 0x10
67#define RAM_SAVE_FLAG_CONTINUE 0x20
68#define RAM_SAVE_FLAG_XBZRLE 0x40
69/* 0x80 is reserved in migration.h; start with 0x100 next */
70#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
71
72static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
73
74static inline bool is_zero_range(uint8_t *p, uint64_t size)
75{
76 return buffer_find_nonzero_offset(p, size) == size;
77}
78
79/* struct contains XBZRLE cache and a static page
80 used by the compression */
81static struct {
82 /* buffer used for XBZRLE encoding */
83 uint8_t *encoded_buf;
84 /* buffer for storing page content */
85 uint8_t *current_buf;
86 /* Cache for XBZRLE, Protected by lock. */
87 PageCache *cache;
88 QemuMutex lock;
89} XBZRLE;
90
91/* buffer used for XBZRLE decoding */
92static uint8_t *xbzrle_decoded_buf;
93
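/* Lock/unlock helpers for the XBZRLE cache; they only touch the mutex
 * when XBZRLE is enabled, so they are no-ops otherwise.
 */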
94static void XBZRLE_cache_lock(void)
95{
96 if (migrate_use_xbzrle())
97 qemu_mutex_lock(&XBZRLE.lock);
98}
99
100static void XBZRLE_cache_unlock(void)
101{
102 if (migrate_use_xbzrle())
103 qemu_mutex_unlock(&XBZRLE.lock);
104}
105
106/*
107 * called from qmp_migrate_set_cache_size in main thread, possibly while
108 * a migration is in progress.
109 * A running migration may be using the cache and might finish during this
110 * call, hence changes to the cache are protected by XBZRLE.lock().
111 */
112int64_t xbzrle_cache_resize(int64_t new_size)
113{
114 PageCache *new_cache;
115 int64_t ret;
116
117 if (new_size < TARGET_PAGE_SIZE) {
118 return -1;
119 }
120
121 XBZRLE_cache_lock();
122
123 if (XBZRLE.cache != NULL) {
124 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
125 goto out_new_size;
126 }
127 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
128 TARGET_PAGE_SIZE);
129 if (!new_cache) {
130 error_report("Error creating cache");
131 ret = -1;
132 goto out;
133 }
134
135 cache_fini(XBZRLE.cache);
136 XBZRLE.cache = new_cache;
137 }
138
139out_new_size:
140 ret = pow2floor(new_size);
141out:
142 XBZRLE_cache_unlock();
143 return ret;
144}
145
146/* accounting for migration statistics */
147typedef struct AccountingInfo {
148 uint64_t dup_pages;
149 uint64_t skipped_pages;
150 uint64_t norm_pages;
151 uint64_t iterations;
152 uint64_t xbzrle_bytes;
153 uint64_t xbzrle_pages;
154 uint64_t xbzrle_cache_miss;
155 double xbzrle_cache_miss_rate;
156 uint64_t xbzrle_overflows;
157} AccountingInfo;
158
159static AccountingInfo acct_info;
160
161static void acct_clear(void)
162{
163 memset(&acct_info, 0, sizeof(acct_info));
164}
165
166uint64_t dup_mig_bytes_transferred(void)
167{
168 return acct_info.dup_pages * TARGET_PAGE_SIZE;
169}
170
171uint64_t dup_mig_pages_transferred(void)
172{
173 return acct_info.dup_pages;
174}
175
176uint64_t skipped_mig_bytes_transferred(void)
177{
178 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
179}
180
181uint64_t skipped_mig_pages_transferred(void)
182{
183 return acct_info.skipped_pages;
184}
185
186uint64_t norm_mig_bytes_transferred(void)
187{
188 return acct_info.norm_pages * TARGET_PAGE_SIZE;
189}
190
191uint64_t norm_mig_pages_transferred(void)
192{
193 return acct_info.norm_pages;
194}
195
196uint64_t xbzrle_mig_bytes_transferred(void)
197{
198 return acct_info.xbzrle_bytes;
199}
200
201uint64_t xbzrle_mig_pages_transferred(void)
202{
203 return acct_info.xbzrle_pages;
204}
205
206uint64_t xbzrle_mig_pages_cache_miss(void)
207{
208 return acct_info.xbzrle_cache_miss;
209}
210
211double xbzrle_mig_cache_miss_rate(void)
212{
213 return acct_info.xbzrle_cache_miss_rate;
214}
215
216uint64_t xbzrle_mig_pages_overflow(void)
217{
218 return acct_info.xbzrle_overflows;
219}
220
221/* This is the last block that we have visited searching for dirty pages
222 */
223static RAMBlock *last_seen_block;
224/* This is the last block from where we have sent data */
225static RAMBlock *last_sent_block;
226static ram_addr_t last_offset;
dd631697 227static QemuMutex migration_bitmap_mutex;
228static uint64_t migration_dirty_pages;
229static uint32_t last_version;
230static bool ram_bulk_stage;
231
232/* used by the search for pages to send */
233struct PageSearchStatus {
234 /* Current block being searched */
235 RAMBlock *block;
236 /* Current offset to search from */
237 ram_addr_t offset;
238 /* Set once we wrap around */
239 bool complete_round;
240};
241typedef struct PageSearchStatus PageSearchStatus;
242
243static struct BitmapRcu {
244 struct rcu_head rcu;
f3f491fc 245 /* Main migration bitmap */
60be6340 246 unsigned long *bmap;
247 /* bitmap of pages that haven't been sent even once
248 * only maintained and used in postcopy at the moment
249 * where it's used to send the dirtymap at the start
250 * of the postcopy phase
251 */
252 unsigned long *unsentmap;
253} *migration_bitmap_rcu;
254
255struct CompressParam {
256 bool start;
257 bool done;
258 QEMUFile *file;
259 QemuMutex mutex;
260 QemuCond cond;
261 RAMBlock *block;
262 ram_addr_t offset;
263};
264typedef struct CompressParam CompressParam;
265
266struct DecompressParam {
267 bool start;
73a8912b 268 bool done;
269 QemuMutex mutex;
270 QemuCond cond;
271 void *des;
d341d9f3 272 uint8_t *compbuf;
273 int len;
274};
275typedef struct DecompressParam DecompressParam;
276
277static CompressParam *comp_param;
278static QemuThread *compress_threads;
279/* comp_done_cond is used to wake up the migration thread when
280 * one of the compression threads has finished the compression.
281 * comp_done_lock is used to co-work with comp_done_cond.
282 * comp_done_lock is the mutex paired with comp_done_cond.
283static QemuMutex *comp_done_lock;
284static QemuCond *comp_done_cond;
285/* The empty QEMUFileOps will be used by file in CompressParam */
286static const QEMUFileOps empty_ops = { };
287
288static bool compression_switch;
289static bool quit_comp_thread;
290static bool quit_decomp_thread;
291static DecompressParam *decomp_param;
292static QemuThread *decompress_threads;
293static QemuMutex decomp_done_lock;
294static QemuCond decomp_done_cond;
295
296static int do_compress_ram_page(CompressParam *param);
297
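/* Compression worker thread: wait until start_compression() hands it a
 * page, compress that page into its private QEMUFile buffer via
 * do_compress_ram_page(), then mark itself done and signal the migration
 * thread, until asked to quit.
 */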
298static void *do_data_compress(void *opaque)
299{
300 CompressParam *param = opaque;
301
302 while (!quit_comp_thread) {
303 qemu_mutex_lock(&param->mutex);
304 /* Re-check quit_comp_thread in case
305 * terminate_compression_threads() is called just before
306 * qemu_mutex_lock(&param->mutex) and after
307 * while (!quit_comp_thread); re-checking it here makes
308 * sure the compression thread terminates as expected.
309 */
310 while (!param->start && !quit_comp_thread) {
311 qemu_cond_wait(&param->cond, &param->mutex);
312 }
313 if (!quit_comp_thread) {
314 do_compress_ram_page(param);
315 }
316 param->start = false;
317 qemu_mutex_unlock(&param->mutex);
318
319 qemu_mutex_lock(comp_done_lock);
320 param->done = true;
321 qemu_cond_signal(comp_done_cond);
322 qemu_mutex_unlock(comp_done_lock);
323 }
324
325 return NULL;
326}
327
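/* Ask every compression thread to quit by setting quit_comp_thread and
 * waking each worker that may be waiting on its condition variable.
 */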
328static inline void terminate_compression_threads(void)
329{
330 int idx, thread_count;
331
332 thread_count = migrate_compress_threads();
333 quit_comp_thread = true;
334 for (idx = 0; idx < thread_count; idx++) {
335 qemu_mutex_lock(&comp_param[idx].mutex);
336 qemu_cond_signal(&comp_param[idx].cond);
337 qemu_mutex_unlock(&comp_param[idx].mutex);
338 }
339}
340
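/* Tear down the compression machinery: signal the workers to quit, join
 * them, and release the per-thread files, mutexes and condition variables.
 */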
341void migrate_compress_threads_join(void)
342{
343 int i, thread_count;
344
345 if (!migrate_use_compression()) {
346 return;
347 }
348 terminate_compression_threads();
349 thread_count = migrate_compress_threads();
350 for (i = 0; i < thread_count; i++) {
351 qemu_thread_join(compress_threads + i);
352 qemu_fclose(comp_param[i].file);
353 qemu_mutex_destroy(&comp_param[i].mutex);
354 qemu_cond_destroy(&comp_param[i].cond);
355 }
356 qemu_mutex_destroy(comp_done_lock);
357 qemu_cond_destroy(comp_done_cond);
358 g_free(compress_threads);
359 g_free(comp_param);
360 g_free(comp_done_cond);
361 g_free(comp_done_lock);
362 compress_threads = NULL;
363 comp_param = NULL;
364 comp_done_cond = NULL;
365 comp_done_lock = NULL;
366}
367
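/* Allocate and start the compression worker threads; a no-op unless
 * compression has been enabled as a migration capability.
 */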
368void migrate_compress_threads_create(void)
369{
370 int i, thread_count;
371
372 if (!migrate_use_compression()) {
373 return;
374 }
375 quit_comp_thread = false;
376 compression_switch = true;
377 thread_count = migrate_compress_threads();
378 compress_threads = g_new0(QemuThread, thread_count);
379 comp_param = g_new0(CompressParam, thread_count);
380 comp_done_cond = g_new0(QemuCond, 1);
381 comp_done_lock = g_new0(QemuMutex, 1);
382 qemu_cond_init(comp_done_cond);
383 qemu_mutex_init(comp_done_lock);
384 for (i = 0; i < thread_count; i++) {
385 /* comp_param[i].file is just used as a dummy buffer to save data; set
386 * its ops to empty.
387 */
388 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
389 comp_param[i].done = true;
390 qemu_mutex_init(&comp_param[i].mutex);
391 qemu_cond_init(&comp_param[i].cond);
392 qemu_thread_create(compress_threads + i, "compress",
393 do_data_compress, comp_param + i,
394 QEMU_THREAD_JOINABLE);
395 }
396}
397
398/**
399 * save_page_header: Write page header to wire
400 *
401 * If this is the 1st block, it also writes the block identification
402 *
403 * Returns: Number of bytes written
404 *
405 * @f: QEMUFile where to send the data
406 * @block: block that contains the page we want to send
407 * @offset: offset inside the block for the page
408 * in the lower bits, it contains flags
409 */
410static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
411{
9f5f380b 412 size_t size, len;
413
414 qemu_put_be64(f, offset);
415 size = 8;
416
417 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
418 len = strlen(block->idstr);
419 qemu_put_byte(f, len);
420 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
421 size += 1 + len;
422 }
423 return size;
424}
425
426/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
427 * If guest dirty memory rate is reduced below the rate at which we can
428 * transfer pages to the destination then we should be able to complete
429 * migration. Some workloads dirty memory way too fast and will not effectively
430 * converge, even with auto-converge.
431 */
432static void mig_throttle_guest_down(void)
433{
434 MigrationState *s = migrate_get_current();
435 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
436 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
437
438 /* We have not started throttling yet. Let's start it. */
439 if (!cpu_throttle_active()) {
440 cpu_throttle_set(pct_initial);
441 } else {
442 /* Throttling already on, just increase the rate */
443 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
444 }
445}
446
447/* Update the xbzrle cache to reflect a page that's been sent as all 0.
448 * The important thing is that a stale (not-yet-0'd) page be replaced
449 * by the new data.
450 * As a bonus, if the page wasn't in the cache it gets added so that
451 * when a small write is made into the 0'd page it gets XBZRLE sent
452 */
453static void xbzrle_cache_zero_page(ram_addr_t current_addr)
454{
455 if (ram_bulk_stage || !migrate_use_xbzrle()) {
456 return;
457 }
458
459 /* We don't care if this fails to allocate a new cache page
460 * as long as it updated an old one */
461 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
462 bitmap_sync_count);
463}
464
465#define ENCODING_FLAG_XBZRLE 0x1
466
467/**
468 * save_xbzrle_page: compress and send current page
469 *
470 * Returns: 1 means that we wrote the page
471 * 0 means that page is identical to the one already sent
472 * -1 means that xbzrle would be longer than normal
473 *
474 * @f: QEMUFile where to send the data
475 * @current_data: pointer to the page data; may be redirected to the cached copy
476 * @current_addr: RAM address of the page (block offset + page offset)
477 * @block: block that contains the page we want to send
478 * @offset: offset inside the block for the page
479 * @last_stage: if we are at the completion stage
480 * @bytes_transferred: increase it with the number of transferred bytes
481 */
482static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
483 ram_addr_t current_addr, RAMBlock *block,
484 ram_addr_t offset, bool last_stage,
485 uint64_t *bytes_transferred)
486{
487 int encoded_len = 0, bytes_xbzrle;
488 uint8_t *prev_cached_page;
489
490 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
491 acct_info.xbzrle_cache_miss++;
492 if (!last_stage) {
493 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
494 bitmap_sync_count) == -1) {
495 return -1;
496 } else {
497 /* update *current_data when the page has been
498 inserted into cache */
499 *current_data = get_cached_data(XBZRLE.cache, current_addr);
500 }
501 }
502 return -1;
503 }
504
505 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
506
507 /* save current buffer into memory */
508 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
509
510 /* XBZRLE encoding (if there is no overflow) */
511 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
512 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
513 TARGET_PAGE_SIZE);
514 if (encoded_len == 0) {
515 DPRINTF("Skipping unmodified page\n");
516 return 0;
517 } else if (encoded_len == -1) {
518 DPRINTF("Overflow\n");
519 acct_info.xbzrle_overflows++;
520 /* update data in the cache */
521 if (!last_stage) {
522 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
523 *current_data = prev_cached_page;
524 }
525 return -1;
526 }
527
528 /* we need to update the data in the cache, in order to get the same data */
529 if (!last_stage) {
530 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
531 }
532
533 /* Send XBZRLE based compressed page */
534 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
535 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
536 qemu_put_be16(f, encoded_len);
537 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
538 bytes_xbzrle += encoded_len + 1 + 2;
539 acct_info.xbzrle_pages++;
540 acct_info.xbzrle_bytes += bytes_xbzrle;
541 *bytes_transferred += bytes_xbzrle;
542
543 return 1;
544}
545
546/* Called with rcu_read_lock() to protect migration_bitmap
547 * rb: The RAMBlock to search for dirty pages in
548 * start: Start address (typically so we can continue from previous page)
549 * ram_addr_abs: Pointer into which to store the address of the dirty page
550 * within the global ram_addr space
551 *
552 * Returns: byte offset within memory region of the start of a dirty page
553 */
56e93d26 554static inline
555ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
556 ram_addr_t start,
557 ram_addr_t *ram_addr_abs)
56e93d26 558{
2f68e399 559 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 560 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
561 uint64_t rb_size = rb->used_length;
562 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 563 unsigned long *bitmap;
564
565 unsigned long next;
566
60be6340 567 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
568 if (ram_bulk_stage && nr > base) {
569 next = nr + 1;
570 } else {
2ff64038 571 next = find_next_bit(bitmap, size, nr);
572 }
573
f3f491fc 574 *ram_addr_abs = next << TARGET_PAGE_BITS;
575 return (next - base) << TARGET_PAGE_BITS;
576}
577
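/* Test-and-clear the dirty bit for 'addr' in the migration bitmap,
 * decrementing migration_dirty_pages when the bit was set.
 * Returns true if the page was dirty.
 */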
578static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
579{
580 bool ret;
581 int nr = addr >> TARGET_PAGE_BITS;
582 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
583
584 ret = test_and_clear_bit(nr, bitmap);
585
586 if (ret) {
587 migration_dirty_pages--;
588 }
589 return ret;
590}
591
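/* Fold the dirty log for [start, start + length) into the migration
 * bitmap and account any newly dirtied pages.
 */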
592static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
593{
2ff64038 594 unsigned long *bitmap;
60be6340 595 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
56e93d26 596 migration_dirty_pages +=
2ff64038 597 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
598}
599
600/* Fix me: there are too many global variables used in migration process. */
601static int64_t start_time;
602static int64_t bytes_xfer_prev;
603static int64_t num_dirty_pages_period;
604static uint64_t xbzrle_cache_miss_prev;
605static uint64_t iterations_prev;
606
607static void migration_bitmap_sync_init(void)
608{
609 start_time = 0;
610 bytes_xfer_prev = 0;
611 num_dirty_pages_period = 0;
612 xbzrle_cache_miss_prev = 0;
613 iterations_prev = 0;
614}
615
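/* Synchronize the dirty log of every RAMBlock into the migration bitmap.
 * Called once per bitmap pass; it also updates the dirty-page-rate and
 * XBZRLE cache-miss statistics and, with auto-converge, may throttle the
 * guest when memory is dirtied faster than it is transferred.
 */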
616static void migration_bitmap_sync(void)
617{
618 RAMBlock *block;
619 uint64_t num_dirty_pages_init = migration_dirty_pages;
620 MigrationState *s = migrate_get_current();
621 int64_t end_time;
622 int64_t bytes_xfer_now;
623
624 bitmap_sync_count++;
625
626 if (!bytes_xfer_prev) {
627 bytes_xfer_prev = ram_bytes_transferred();
628 }
629
630 if (!start_time) {
631 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
632 }
633
634 trace_migration_bitmap_sync_start();
635 address_space_sync_dirty_bitmap(&address_space_memory);
636
dd631697 637 qemu_mutex_lock(&migration_bitmap_mutex);
638 rcu_read_lock();
639 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2f68e399 640 migration_bitmap_sync_range(block->offset, block->used_length);
641 }
642 rcu_read_unlock();
dd631697 643 qemu_mutex_unlock(&migration_bitmap_mutex);
644
645 trace_migration_bitmap_sync_end(migration_dirty_pages
646 - num_dirty_pages_init);
647 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
648 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
649
650 /* more than 1 second = 1000 milliseconds */
651 if (end_time > start_time + 1000) {
652 if (migrate_auto_converge()) {
653 /* The following detection logic can be refined later. For now:
654 Check to see if the number of dirtied bytes is 50% more than the approx.
655 amount of bytes that just got transferred since the last time we
656 were in this routine. If that happens twice, start or increase
657 throttling */
56e93d26 658 bytes_xfer_now = ram_bytes_transferred();
070afca2 659
660 if (s->dirty_pages_rate &&
661 (num_dirty_pages_period * TARGET_PAGE_SIZE >
662 (bytes_xfer_now - bytes_xfer_prev)/2) &&
070afca2 663 (dirty_rate_high_cnt++ >= 2)) {
56e93d26 664 trace_migration_throttle();
56e93d26 665 dirty_rate_high_cnt = 0;
070afca2 666 mig_throttle_guest_down();
667 }
668 bytes_xfer_prev = bytes_xfer_now;
56e93d26 669 }
070afca2 670
671 if (migrate_use_xbzrle()) {
672 if (iterations_prev != acct_info.iterations) {
673 acct_info.xbzrle_cache_miss_rate =
674 (double)(acct_info.xbzrle_cache_miss -
675 xbzrle_cache_miss_prev) /
676 (acct_info.iterations - iterations_prev);
677 }
678 iterations_prev = acct_info.iterations;
679 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
680 }
681 s->dirty_pages_rate = num_dirty_pages_period * 1000
682 / (end_time - start_time);
683 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
684 start_time = end_time;
685 num_dirty_pages_period = 0;
686 }
687 s->dirty_sync_count = bitmap_sync_count;
688 if (migrate_use_events()) {
689 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
690 }
691}
692
693/**
694 * save_zero_page: Send the zero page to the stream
695 *
696 * Returns: Number of pages written.
697 *
698 * @f: QEMUFile where to send the data
699 * @block: block that contains the page we want to send
700 * @offset: offset inside the block for the page
701 * @p: pointer to the page
702 * @bytes_transferred: increase it with the number of transferred bytes
703 */
704static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
705 uint8_t *p, uint64_t *bytes_transferred)
706{
707 int pages = -1;
708
709 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
710 acct_info.dup_pages++;
711 *bytes_transferred += save_page_header(f, block,
712 offset | RAM_SAVE_FLAG_COMPRESS);
713 qemu_put_byte(f, 0);
714 *bytes_transferred += 1;
715 pages = 1;
716 }
717
718 return pages;
719}
720
721/**
722 * ram_save_page: Send the given page to the stream
723 *
724 * Returns: Number of pages written.
725 * < 0 - error
726 * >=0 - Number of pages written - this might legally be 0
727 * if xbzrle noticed the page was the same.
728 *
729 * @f: QEMUFile where to send the data
730 * @block: block that contains the page we want to send
731 * @offset: offset inside the block for the page
732 * @last_stage: if we are at the completion stage
733 * @bytes_transferred: increase it with the number of transferred bytes
734 */
a08f6890 735static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
736 bool last_stage, uint64_t *bytes_transferred)
737{
738 int pages = -1;
739 uint64_t bytes_xmit;
740 ram_addr_t current_addr;
741 uint8_t *p;
742 int ret;
743 bool send_async = true;
744 RAMBlock *block = pss->block;
745 ram_addr_t offset = pss->offset;
56e93d26 746
2f68e399 747 p = block->host + offset;
748
749 /* When in doubt, send the page as normal */
750 bytes_xmit = 0;
751 ret = ram_control_save_page(f, block->offset,
752 offset, TARGET_PAGE_SIZE, &bytes_xmit);
753 if (bytes_xmit) {
754 *bytes_transferred += bytes_xmit;
755 pages = 1;
756 }
757
758 XBZRLE_cache_lock();
759
760 current_addr = block->offset + offset;
761
762 if (block == last_sent_block) {
763 offset |= RAM_SAVE_FLAG_CONTINUE;
764 }
765 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
766 if (ret != RAM_SAVE_CONTROL_DELAYED) {
767 if (bytes_xmit > 0) {
768 acct_info.norm_pages++;
769 } else if (bytes_xmit == 0) {
770 acct_info.dup_pages++;
771 }
772 }
773 } else {
774 pages = save_zero_page(f, block, offset, p, bytes_transferred);
775 if (pages > 0) {
776 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
777 * page would be stale
778 */
779 xbzrle_cache_zero_page(current_addr);
780 } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
781 pages = save_xbzrle_page(f, &p, current_addr, block,
782 offset, last_stage, bytes_transferred);
783 if (!last_stage) {
784 /* Can't send this cached data async, since the cache page
785 * might get updated before it gets to the wire
786 */
787 send_async = false;
788 }
789 }
790 }
791
792 /* XBZRLE overflow or normal page */
793 if (pages == -1) {
794 *bytes_transferred += save_page_header(f, block,
795 offset | RAM_SAVE_FLAG_PAGE);
796 if (send_async) {
797 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
798 } else {
799 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
800 }
801 *bytes_transferred += TARGET_PAGE_SIZE;
802 pages = 1;
803 acct_info.norm_pages++;
804 }
805
806 XBZRLE_cache_unlock();
807
808 return pages;
809}
810
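/* Compress one page into param->file (a buffer-backed QEMUFile): write
 * the page header followed by the zlib-compressed page contents.
 * Returns the number of bytes produced.
 */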
811static int do_compress_ram_page(CompressParam *param)
812{
813 int bytes_sent, blen;
814 uint8_t *p;
815 RAMBlock *block = param->block;
816 ram_addr_t offset = param->offset;
817
2f68e399 818 p = block->host + (offset & TARGET_PAGE_MASK);
819
820 bytes_sent = save_page_header(param->file, block, offset |
821 RAM_SAVE_FLAG_COMPRESS_PAGE);
822 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
823 migrate_compress_level());
824 bytes_sent += blen;
825
826 return bytes_sent;
827}
828
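/* Kick a compression worker: mark its parameter block busy and signal its
 * condition variable under the per-thread mutex. start_decompression()
 * below does the same for the decompression side.
 */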
829static inline void start_compression(CompressParam *param)
830{
831 param->done = false;
832 qemu_mutex_lock(&param->mutex);
833 param->start = true;
834 qemu_cond_signal(&param->cond);
835 qemu_mutex_unlock(&param->mutex);
836}
837
838static inline void start_decompression(DecompressParam *param)
839{
73a8912b 840 param->done = false;
841 qemu_mutex_lock(&param->mutex);
842 param->start = true;
843 qemu_cond_signal(&param->cond);
844 qemu_mutex_unlock(&param->mutex);
845}
846
847static uint64_t bytes_transferred;
848
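/* Wait for every compression thread to finish its current page and flush
 * each thread's buffered output into the migration stream.
 */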
849static void flush_compressed_data(QEMUFile *f)
850{
851 int idx, len, thread_count;
852
853 if (!migrate_use_compression()) {
854 return;
855 }
856 thread_count = migrate_compress_threads();
857 for (idx = 0; idx < thread_count; idx++) {
858 if (!comp_param[idx].done) {
859 qemu_mutex_lock(comp_done_lock);
860 while (!comp_param[idx].done && !quit_comp_thread) {
861 qemu_cond_wait(comp_done_cond, comp_done_lock);
862 }
863 qemu_mutex_unlock(comp_done_lock);
864 }
865 if (!quit_comp_thread) {
866 len = qemu_put_qemu_file(f, comp_param[idx].file);
867 bytes_transferred += len;
868 }
869 }
870}
871
872static inline void set_compress_params(CompressParam *param, RAMBlock *block,
873 ram_addr_t offset)
874{
875 param->block = block;
876 param->offset = offset;
877}
878
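/* Find an idle compression thread (waiting on comp_done_cond if none is
 * free), flush its previous buffered output into the stream and hand it
 * the next page to compress.
 */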
879static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
880 ram_addr_t offset,
881 uint64_t *bytes_transferred)
882{
883 int idx, thread_count, bytes_xmit = -1, pages = -1;
884
885 thread_count = migrate_compress_threads();
886 qemu_mutex_lock(comp_done_lock);
887 while (true) {
888 for (idx = 0; idx < thread_count; idx++) {
889 if (comp_param[idx].done) {
890 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
891 set_compress_params(&comp_param[idx], block, offset);
892 start_compression(&comp_param[idx]);
893 pages = 1;
894 acct_info.norm_pages++;
895 *bytes_transferred += bytes_xmit;
896 break;
897 }
898 }
899 if (pages > 0) {
900 break;
901 } else {
902 qemu_cond_wait(comp_done_cond, comp_done_lock);
903 }
904 }
905 qemu_mutex_unlock(comp_done_lock);
906
907 return pages;
908}
909
910/**
911 * ram_save_compressed_page: compress the given page and send it to the stream
912 *
913 * Returns: Number of pages written.
914 *
915 * @f: QEMUFile where to send the data
916 * @block: block that contains the page we want to send
917 * @offset: offset inside the block for the page
918 * @last_stage: if we are at the completion stage
919 * @bytes_transferred: increase it with the number of transferred bytes
920 */
921static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
922 bool last_stage,
923 uint64_t *bytes_transferred)
924{
925 int pages = -1;
926 uint64_t bytes_xmit;
927 uint8_t *p;
928 int ret;
929 RAMBlock *block = pss->block;
930 ram_addr_t offset = pss->offset;
56e93d26 931
2f68e399 932 p = block->host + offset;
933
934 bytes_xmit = 0;
935 ret = ram_control_save_page(f, block->offset,
936 offset, TARGET_PAGE_SIZE, &bytes_xmit);
937 if (bytes_xmit) {
938 *bytes_transferred += bytes_xmit;
939 pages = 1;
940 }
941 if (block == last_sent_block) {
942 offset |= RAM_SAVE_FLAG_CONTINUE;
943 }
944 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
945 if (ret != RAM_SAVE_CONTROL_DELAYED) {
946 if (bytes_xmit > 0) {
947 acct_info.norm_pages++;
948 } else if (bytes_xmit == 0) {
949 acct_info.dup_pages++;
950 }
951 }
952 } else {
953 /* When starting the process of a new block, the first page of
954 * the block should be sent out before other pages in the same
955 * block, and all the pages in the last block should have been
956 * sent out. Keeping this order is important, because the 'cont'
957 * flag is used to avoid resending the block name.
958 */
959 if (block != last_sent_block) {
960 flush_compressed_data(f);
961 pages = save_zero_page(f, block, offset, p, bytes_transferred);
962 if (pages == -1) {
963 set_compress_params(&comp_param[0], block, offset);
964 /* Use the qemu thread to compress the data to make sure the
965 * first page is sent out before other pages
966 */
967 bytes_xmit = do_compress_ram_page(&comp_param[0]);
968 acct_info.norm_pages++;
969 qemu_put_qemu_file(f, comp_param[0].file);
970 *bytes_transferred += bytes_xmit;
971 pages = 1;
972 }
973 } else {
974 pages = save_zero_page(f, block, offset, p, bytes_transferred);
975 if (pages == -1) {
976 pages = compress_page_with_multi_thread(f, block, offset,
977 bytes_transferred);
978 }
979 }
980 }
981
982 return pages;
983}
984
985/*
986 * Find the next dirty page and update any state associated with
987 * the search process.
988 *
989 * Returns: True if a page is found
990 *
991 * @f: Current migration stream.
992 * @pss: Data about the state of the current dirty page scan.
993 * @*again: Set to false if the search has scanned the whole of RAM
994 * *ram_addr_abs: Pointer into which to store the address of the dirty page
995 * within the global ram_addr space
996 */
997static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
f3f491fc 998 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 999{
1000 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
1001 ram_addr_abs);
1002 if (pss->complete_round && pss->block == last_seen_block &&
1003 pss->offset >= last_offset) {
1004 /*
1005 * We've been once around the RAM and haven't found anything.
1006 * Give up.
1007 */
1008 *again = false;
1009 return false;
1010 }
1011 if (pss->offset >= pss->block->used_length) {
1012 /* Didn't find anything in this RAM Block */
1013 pss->offset = 0;
1014 pss->block = QLIST_NEXT_RCU(pss->block, next);
1015 if (!pss->block) {
1016 /* Hit the end of the list */
1017 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1018 /* Flag that we've looped */
1019 pss->complete_round = true;
1020 ram_bulk_stage = false;
1021 if (migrate_use_xbzrle()) {
1022 /* If xbzrle is on, stop using the data compression at this
1023 * point. In theory, xbzrle can do better than compression.
1024 */
1025 flush_compressed_data(f);
1026 compression_switch = false;
1027 }
1028 }
1029 /* Didn't find anything this time, but try again on the new block */
1030 *again = true;
1031 return false;
1032 } else {
1033 /* Can go around again, but... */
1034 *again = true;
1035 /* We've found something so probably don't need to */
1036 return true;
1037 }
1038}
1039
1040/*
1041 * Helper for 'get_queued_page' - gets a page off the queue
1042 * ms: MigrationState in
1043 * *offset: Used to return the offset within the RAMBlock
1044 * ram_addr_abs: global offset in the dirty/sent bitmaps
1045 *
1046 * Returns: block (or NULL if none available)
1047 */
1048static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1049 ram_addr_t *ram_addr_abs)
1050{
1051 RAMBlock *block = NULL;
1052
1053 qemu_mutex_lock(&ms->src_page_req_mutex);
1054 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1055 struct MigrationSrcPageRequest *entry =
1056 QSIMPLEQ_FIRST(&ms->src_page_requests);
1057 block = entry->rb;
1058 *offset = entry->offset;
1059 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1060 TARGET_PAGE_MASK;
1061
1062 if (entry->len > TARGET_PAGE_SIZE) {
1063 entry->len -= TARGET_PAGE_SIZE;
1064 entry->offset += TARGET_PAGE_SIZE;
1065 } else {
1066 memory_region_unref(block->mr);
1067 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1068 g_free(entry);
1069 }
1070 }
1071 qemu_mutex_unlock(&ms->src_page_req_mutex);
1072
1073 return block;
1074}
1075
1076/*
1077 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1078 * that are already sent (!dirty)
1079 *
1080 * ms: MigrationState in
1081 * pss: PageSearchStatus structure updated with found block/offset
1082 * ram_addr_abs: global offset in the dirty/sent bitmaps
1083 *
1084 * Returns: true if a queued page is found
1085 */
1086static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1087 ram_addr_t *ram_addr_abs)
1088{
1089 RAMBlock *block;
1090 ram_addr_t offset;
1091 bool dirty;
1092
1093 do {
1094 block = unqueue_page(ms, &offset, ram_addr_abs);
1095 /*
1096 * We're sending this page, and since it's postcopy nothing else
1097 * will dirty it, and we must make sure it doesn't get sent again
1098 * even if this queue request was received after the background
1099 * search already sent it.
1100 */
1101 if (block) {
1102 unsigned long *bitmap;
1103 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1104 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1105 if (!dirty) {
1106 trace_get_queued_page_not_dirty(
1107 block->idstr, (uint64_t)offset,
1108 (uint64_t)*ram_addr_abs,
1109 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1110 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1111 } else {
1112 trace_get_queued_page(block->idstr,
1113 (uint64_t)offset,
1114 (uint64_t)*ram_addr_abs);
1115 }
1116 }
1117
1118 } while (block && !dirty);
1119
1120 if (block) {
1121 /*
1122 * As soon as we start servicing pages out of order, then we have
1123 * to kill the bulk stage, since the bulk stage assumes
1124 * in (migration_bitmap_find_dirty) that every page is
1125 * dirty, and that's no longer true.
1126 */
1127 ram_bulk_stage = false;
1128
1129 /*
1130 * We want the background search to continue from the queued page
1131 * since the guest is likely to want other pages near to the page
1132 * it just requested.
1133 */
1134 pss->block = block;
1135 pss->offset = offset;
1136 }
1137
1138 return !!block;
1139}
1140
1141/**
1142 * flush_page_queue: Flush any remaining pages in the ram request queue
1143 * it should be empty at the end anyway, but in error cases there may be
1144 * some left.
1145 *
1146 * ms: MigrationState
1147 */
1148void flush_page_queue(MigrationState *ms)
1149{
1150 struct MigrationSrcPageRequest *mspr, *next_mspr;
1151 /* This queue should generally be empty - but in the case of a failed
1152 * migration it might have some entries left over.
1153 */
1154 rcu_read_lock();
1155 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1156 memory_region_unref(mspr->rb->mr);
1157 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1158 g_free(mspr);
1159 }
1160 rcu_read_unlock();
1161}
1162
1163/**
1164 * Queue the pages for transmission, e.g. a request from postcopy destination
1165 * ms: MigrationState in which the queue is held
1166 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1167 * start: Offset from the start of the RAMBlock
1168 * len: Length (in bytes) to send
1169 * Return: 0 on success
1170 */
1171int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1172 ram_addr_t start, ram_addr_t len)
1173{
1174 RAMBlock *ramblock;
1175
d3bf5418 1176 ms->postcopy_requests++;
1177 rcu_read_lock();
1178 if (!rbname) {
1179 /* Reuse last RAMBlock */
1180 ramblock = ms->last_req_rb;
1181
1182 if (!ramblock) {
1183 /*
1184 * Shouldn't happen, we can't reuse the last RAMBlock if
1185 * it's the 1st request.
1186 */
1187 error_report("ram_save_queue_pages no previous block");
1188 goto err;
1189 }
1190 } else {
1191 ramblock = qemu_ram_block_by_name(rbname);
1192
1193 if (!ramblock) {
1194 /* We shouldn't be asked for a non-existent RAMBlock */
1195 error_report("ram_save_queue_pages no block '%s'", rbname);
1196 goto err;
1197 }
1198 ms->last_req_rb = ramblock;
1199 }
1200 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1201 if (start+len > ramblock->used_length) {
1202 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1203 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1204 __func__, start, len, ramblock->used_length);
1205 goto err;
1206 }
1207
1208 struct MigrationSrcPageRequest *new_entry =
1209 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1210 new_entry->rb = ramblock;
1211 new_entry->offset = start;
1212 new_entry->len = len;
1213
1214 memory_region_ref(ramblock->mr);
1215 qemu_mutex_lock(&ms->src_page_req_mutex);
1216 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1217 qemu_mutex_unlock(&ms->src_page_req_mutex);
1218 rcu_read_unlock();
1219
1220 return 0;
1221
1222err:
1223 rcu_read_unlock();
1224 return -1;
1225}
1226
1227/**
1228 * ram_save_target_page: Save one target page
1229 *
1230 *
1231 * @f: QEMUFile where to send the data
1232 * @block: pointer to block that contains the page we want to send
1233 * @offset: offset inside the block for the page;
1234 * @last_stage: if we are at the completion stage
1235 * @bytes_transferred: increase it with the number of transferred bytes
1236 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1237 *
1238 * Returns: Number of pages written.
1239 */
1240static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
a08f6890 1241 PageSearchStatus *pss,
1242 bool last_stage,
1243 uint64_t *bytes_transferred,
1244 ram_addr_t dirty_ram_abs)
1245{
1246 int res = 0;
1247
1248 /* Check if the page is dirty and if it is, send it */
1249 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1250 unsigned long *unsentmap;
1251 if (compression_switch && migrate_use_compression()) {
a08f6890 1252 res = ram_save_compressed_page(f, pss,
1253 last_stage,
1254 bytes_transferred);
1255 } else {
a08f6890 1256 res = ram_save_page(f, pss, last_stage,
1257 bytes_transferred);
1258 }
1259
1260 if (res < 0) {
1261 return res;
1262 }
1263 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1264 if (unsentmap) {
1265 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1266 }
1267 /* Only update last_sent_block if a block was actually sent; xbzrle
1268 * might have decided the page was identical so didn't bother writing
1269 * to the stream.
1270 */
1271 if (res > 0) {
a08f6890 1272 last_sent_block = pss->block;
3fd3c4b3 1273 }
1274 }
1275
1276 return res;
1277}
1278
1279/**
cb8d4c8f 1280 * ram_save_host_page: Starting at *offset send pages up to the end
1281 * of the current host page. It's valid for the initial
1282 * offset to point into the middle of a host page
1283 * in which case the remainder of the host page is sent.
1284 * Only dirty target pages are sent.
1285 *
1286 * Returns: Number of pages written.
1287 *
1288 * @f: QEMUFile where to send the data
1289 * @block: pointer to block that contains the page we want to send
1290 * @offset: offset inside the block for the page; updated to last target page
1291 * sent
1292 * @last_stage: if we are at the completion stage
1293 * @bytes_transferred: increase it with the number of transferred bytes
1294 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1295 */
1296static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1297 PageSearchStatus *pss,
1298 bool last_stage,
1299 uint64_t *bytes_transferred,
1300 ram_addr_t dirty_ram_abs)
1301{
1302 int tmppages, pages = 0;
1303 do {
a08f6890 1304 tmppages = ram_save_target_page(ms, f, pss, last_stage,
1305 bytes_transferred, dirty_ram_abs);
1306 if (tmppages < 0) {
1307 return tmppages;
1308 }
1309
1310 pages += tmppages;
a08f6890 1311 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1312 dirty_ram_abs += TARGET_PAGE_SIZE;
a08f6890 1313 } while (pss->offset & (qemu_host_page_size - 1));
1314
1315 /* The offset we leave with is the last one we looked at */
a08f6890 1316 pss->offset -= TARGET_PAGE_SIZE;
1317 return pages;
1318}
6c595cde 1319
1320/**
1321 * ram_find_and_save_block: Finds a dirty page and sends it to f
1322 *
1323 * Called within an RCU critical section.
1324 *
1325 * Returns: The number of pages written
1326 * 0 means no dirty pages
1327 *
1328 * @f: QEMUFile where to send the data
1329 * @last_stage: if we are at the completion stage
1330 * @bytes_transferred: increase it with the number of transferred bytes
1331 *
1332 * On systems where host-page-size > target-page-size it will send all the
1333 * pages in a host page that are dirty.
1334 */
1335
1336static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1337 uint64_t *bytes_transferred)
1338{
b8fb8cb7 1339 PageSearchStatus pss;
a82d593b 1340 MigrationState *ms = migrate_get_current();
56e93d26 1341 int pages = 0;
b9e60928 1342 bool again, found;
1343 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1344 ram_addr_t space */
56e93d26 1345
1346 pss.block = last_seen_block;
1347 pss.offset = last_offset;
1348 pss.complete_round = false;
1349
1350 if (!pss.block) {
1351 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1352 }
56e93d26 1353
b9e60928 1354 do {
1355 again = true;
1356 found = get_queued_page(ms, &pss, &dirty_ram_abs);
b9e60928 1357
1358 if (!found) {
1359 /* priority queue empty, so just search for something dirty */
1360 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1361 }
f3f491fc 1362
a82d593b 1363 if (found) {
a08f6890 1364 pages = ram_save_host_page(ms, f, &pss,
1365 last_stage, bytes_transferred,
1366 dirty_ram_abs);
56e93d26 1367 }
b9e60928 1368 } while (!pages && again);
56e93d26 1369
1370 last_seen_block = pss.block;
1371 last_offset = pss.offset;
1372
1373 return pages;
1374}
1375
1376void acct_update_position(QEMUFile *f, size_t size, bool zero)
1377{
1378 uint64_t pages = size / TARGET_PAGE_SIZE;
1379 if (zero) {
1380 acct_info.dup_pages += pages;
1381 } else {
1382 acct_info.norm_pages += pages;
1383 bytes_transferred += size;
1384 qemu_update_position(f, size);
1385 }
1386}
1387
1388static ram_addr_t ram_save_remaining(void)
1389{
1390 return migration_dirty_pages;
1391}
1392
1393uint64_t ram_bytes_remaining(void)
1394{
1395 return ram_save_remaining() * TARGET_PAGE_SIZE;
1396}
1397
1398uint64_t ram_bytes_transferred(void)
1399{
1400 return bytes_transferred;
1401}
1402
1403uint64_t ram_bytes_total(void)
1404{
1405 RAMBlock *block;
1406 uint64_t total = 0;
1407
1408 rcu_read_lock();
1409 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1410 total += block->used_length;
1411 rcu_read_unlock();
1412 return total;
1413}
1414
1415void free_xbzrle_decoded_buf(void)
1416{
1417 g_free(xbzrle_decoded_buf);
1418 xbzrle_decoded_buf = NULL;
1419}
1420
1421static void migration_bitmap_free(struct BitmapRcu *bmap)
1422{
1423 g_free(bmap->bmap);
f3f491fc 1424 g_free(bmap->unsentmap);
1425 g_free(bmap);
1426}
1427
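/* Release the RAM migration state: detach the migration bitmap via RCU,
 * stop dirty logging, and free the XBZRLE cache and its buffers.
 */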
6ad2a215 1428static void ram_migration_cleanup(void *opaque)
56e93d26 1429{
1430 /* The caller holds the iothread lock or is in a bottom half, so there
1431 * is no write race against this migration_bitmap
1432 */
1433 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1434 atomic_rcu_set(&migration_bitmap_rcu, NULL);
2ff64038 1435 if (bitmap) {
56e93d26 1436 memory_global_dirty_log_stop();
60be6340 1437 call_rcu(bitmap, migration_bitmap_free, rcu);
1438 }
1439
1440 XBZRLE_cache_lock();
1441 if (XBZRLE.cache) {
1442 cache_fini(XBZRLE.cache);
1443 g_free(XBZRLE.encoded_buf);
1444 g_free(XBZRLE.current_buf);
1445 XBZRLE.cache = NULL;
1446 XBZRLE.encoded_buf = NULL;
1447 XBZRLE.current_buf = NULL;
1448 }
1449 XBZRLE_cache_unlock();
1450}
1451
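/* Reset the per-migration globals so a new save pass starts from the
 * beginning of the RAMBlock list in bulk stage.
 */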
1452static void reset_ram_globals(void)
1453{
1454 last_seen_block = NULL;
1455 last_sent_block = NULL;
1456 last_offset = 0;
1457 last_version = ram_list.version;
1458 ram_bulk_stage = true;
1459}
1460
1461#define MAX_WAIT 50 /* ms, half buffered_file limit */
1462
1463void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1464{
1465 /* called in qemu main thread, so there is
1466 * no writing race against this migration_bitmap
1467 */
1468 if (migration_bitmap_rcu) {
1469 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1470 bitmap = g_new(struct BitmapRcu, 1);
1471 bitmap->bmap = bitmap_new(new);
1472
1473 /* Prevent bits in migration_bitmap from being set
1474 * by migration_bitmap_sync_range() at the same time.
1475 * It is safe for migration if bits in migration_bitmap are
1476 * cleared at the same time.
1477 */
1478 qemu_mutex_lock(&migration_bitmap_mutex);
1479 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1480 bitmap_set(bitmap->bmap, old, new - old);
1481
1482 /* We don't have a way to safely extend the sentmap
1483 * with RCU; so mark it as missing, entry to postcopy
1484 * will fail.
1485 */
1486 bitmap->unsentmap = NULL;
1487
60be6340 1488 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1489 qemu_mutex_unlock(&migration_bitmap_mutex);
1490 migration_dirty_pages += new - old;
60be6340 1491 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1492 }
1493}
56e93d26 1494
1495/*
1496 * 'expected' is the value you expect the bitmap mostly to be full
1497 * of; it won't bother printing lines that are all this value.
1498 * If 'todump' is null the migration bitmap is dumped.
1499 */
1500void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1501{
1502 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1503
1504 int64_t cur;
1505 int64_t linelen = 128;
1506 char linebuf[129];
1507
1508 if (!todump) {
1509 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1510 }
1511
1512 for (cur = 0; cur < ram_pages; cur += linelen) {
1513 int64_t curb;
1514 bool found = false;
1515 /*
1516 * Last line; catch the case where the line length
1517 * is longer than remaining ram
1518 */
1519 if (cur + linelen > ram_pages) {
1520 linelen = ram_pages - cur;
1521 }
1522 for (curb = 0; curb < linelen; curb++) {
1523 bool thisbit = test_bit(cur + curb, todump);
1524 linebuf[curb] = thisbit ? '1' : '.';
1525 found = found || (thisbit != expected);
1526 }
1527 if (found) {
1528 linebuf[curb] = '\0';
1529 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1530 }
1531 }
1532}
1533
1534/* **** functions for postcopy ***** */
1535
1536/*
1537 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1538 * Note: At this point the 'unsentmap' is the processed bitmap combined
1539 * with the dirtymap; so a '1' means it's either dirty or unsent.
1540 * start,length: Indexes into the bitmap for the first bit
1541 * representing the named block and length in target-pages
1542 */
1543static int postcopy_send_discard_bm_ram(MigrationState *ms,
1544 PostcopyDiscardState *pds,
1545 unsigned long start,
1546 unsigned long length)
1547{
1548 unsigned long end = start + length; /* one after the end */
1549 unsigned long current;
1550 unsigned long *unsentmap;
1551
1552 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1553 for (current = start; current < end; ) {
1554 unsigned long one = find_next_bit(unsentmap, end, current);
1555
1556 if (one <= end) {
1557 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1558 unsigned long discard_length;
1559
1560 if (zero >= end) {
1561 discard_length = end - one;
1562 } else {
1563 discard_length = zero - one;
1564 }
1565 if (discard_length) {
1566 postcopy_discard_send_range(ms, pds, one, discard_length);
1567 }
1568 current = one + discard_length;
1569 } else {
1570 current = one;
1571 }
1572 }
1573
1574 return 0;
1575}
1576
1577/*
1578 * Utility for the outgoing postcopy code.
1579 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1580 * passing it bitmap indexes and name.
1581 * Returns: 0 on success
1582 * (qemu_ram_foreach_block ends up passing unscaled lengths
1583 * which would mean postcopy code would have to deal with target page)
1584 */
1585static int postcopy_each_ram_send_discard(MigrationState *ms)
1586{
1587 struct RAMBlock *block;
1588 int ret;
1589
1590 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1591 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1592 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1593 first,
1594 block->idstr);
1595
1596 /*
1597 * Postcopy sends chunks of bitmap over the wire, but it
1598 * just needs indexes at this point, avoids it having
1599 * target page specific code.
1600 */
1601 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1602 block->used_length >> TARGET_PAGE_BITS);
1603 postcopy_discard_send_finish(ms, pds);
1604 if (ret) {
1605 return ret;
1606 }
1607 }
1608
1609 return 0;
1610}
1611
1612/*
1613 * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1614 * the two bitmaps, that are similar, but one is inverted.
1615 *
1616 * We search for runs of target-pages that don't start or end on a
1617 * host page boundary;
1618 * unsent_pass=true: Cleans up partially unsent host pages by searching
1619 * the unsentmap
1620 * unsent_pass=false: Cleans up partially dirty host pages by searching
1621 * the main migration bitmap
1622 *
1623 */
1624static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1625 RAMBlock *block,
1626 PostcopyDiscardState *pds)
1627{
1628 unsigned long *bitmap;
1629 unsigned long *unsentmap;
1630 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1631 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1632 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1633 unsigned long last = first + (len - 1);
1634 unsigned long run_start;
1635
1636 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1637 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1638
1639 if (unsent_pass) {
1640 /* Find a sent page */
1641 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1642 } else {
1643 /* Find a dirty page */
1644 run_start = find_next_bit(bitmap, last + 1, first);
1645 }
1646
1647 while (run_start <= last) {
1648 bool do_fixup = false;
1649 unsigned long fixup_start_addr;
1650 unsigned long host_offset;
1651
1652 /*
1653 * If the start of this run of pages is in the middle of a host
1654 * page, then we need to fixup this host page.
1655 */
1656 host_offset = run_start % host_ratio;
1657 if (host_offset) {
1658 do_fixup = true;
1659 run_start -= host_offset;
1660 fixup_start_addr = run_start;
1661 /* For the next pass */
1662 run_start = run_start + host_ratio;
1663 } else {
1664 /* Find the end of this run */
1665 unsigned long run_end;
1666 if (unsent_pass) {
1667 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1668 } else {
1669 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1670 }
1671 /*
1672 * If the end isn't at the start of a host page, then the
1673 * run doesn't finish at the end of a host page
1674 * and we need to discard.
1675 */
1676 host_offset = run_end % host_ratio;
1677 if (host_offset) {
1678 do_fixup = true;
1679 fixup_start_addr = run_end - host_offset;
1680 /*
1681 * This host page has gone, the next loop iteration starts
1682 * from after the fixup
1683 */
1684 run_start = fixup_start_addr + host_ratio;
1685 } else {
1686 /*
1687 * No discards on this iteration, next loop starts from
1688 * next sent/dirty page
1689 */
1690 run_start = run_end + 1;
1691 }
1692 }
1693
1694 if (do_fixup) {
1695 unsigned long page;
1696
1697 /* Tell the destination to discard this page */
1698 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1699 /* For the unsent_pass we:
1700 * discard partially sent pages
1701 * For the !unsent_pass (dirty) we:
1702 * discard partially dirty pages that were sent
1703 * (any partially sent pages were already discarded
1704 * by the previous unsent_pass)
1705 */
1706 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1707 host_ratio);
1708 }
1709
1710 /* Clean up the bitmap */
1711 for (page = fixup_start_addr;
1712 page < fixup_start_addr + host_ratio; page++) {
1713 /* All pages in this host page are now not sent */
1714 set_bit(page, unsentmap);
1715
1716 /*
1717 * Remark them as dirty, updating the count for any pages
1718 * that weren't previously dirty.
1719 */
1720 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1721 }
1722 }
1723
1724 if (unsent_pass) {
1725 /* Find the next sent page for the next iteration */
1726 run_start = find_next_zero_bit(unsentmap, last + 1,
1727 run_start);
1728 } else {
1729 /* Find the next dirty page for the next iteration */
1730 run_start = find_next_bit(bitmap, last + 1, run_start);
1731 }
1732 }
1733}
1734
1735/*
1736 * Utility for the outgoing postcopy code.
1737 *
1738 * Discard any partially sent host-page size chunks, mark any partially
1739 * dirty host-page size chunks as all dirty.
1740 *
1741 * Returns: 0 on success
1742 */
1743static int postcopy_chunk_hostpages(MigrationState *ms)
1744{
1745 struct RAMBlock *block;
1746
1747 if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1748 /* Easy case - TPS==HPS - nothing to be done */
1749 return 0;
1750 }
1751
1752 /* Easiest way to make sure we don't resume in the middle of a host-page */
1753 last_seen_block = NULL;
1754 last_sent_block = NULL;
1755 last_offset = 0;
1756
1757 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1758 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1759
1760 PostcopyDiscardState *pds =
1761 postcopy_discard_send_init(ms, first, block->idstr);
1762
1763 /* First pass: Discard all partially sent host pages */
1764 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1765 /*
1766 * Second pass: Ensure that all partially dirty host pages are made
1767 * fully dirty.
1768 */
1769 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1770
1771 postcopy_discard_send_finish(ms, pds);
1772 } /* ram_list loop */
1773
1774 return 0;
1775}
1776
1777/*
1778 * Transmit the set of pages to be discarded after precopy to the target;
1779 * these are pages that:
1780 * a) Have been previously transmitted but are now dirty again
1781 * b) Pages that have never been transmitted, this ensures that
1782 * any pages on the destination that have been mapped by background
1783 * tasks get discarded (transparent huge pages is the specific concern)
1784 * Hopefully this is pretty sparse
1785 */
1786int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1787{
1788 int ret;
1789 unsigned long *bitmap, *unsentmap;
1790
1791 rcu_read_lock();
1792
1793 /* This should be our last sync, the src is now paused */
1794 migration_bitmap_sync();
1795
1796 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1797 if (!unsentmap) {
1798 /* We don't have a safe way to resize the sentmap, so
1799 * if the bitmap was resized it will be NULL at this
1800 * point.
1801 */
1802 error_report("migration ram resized during precopy phase");
1803 rcu_read_unlock();
1804 return -EINVAL;
1805 }
1806
1807 /* Deal with TPS != HPS */
1808 ret = postcopy_chunk_hostpages(ms);
1809 if (ret) {
1810 rcu_read_unlock();
1811 return ret;
1812 }
1813
1814 /*
1815 * Update the unsentmap to be unsentmap = unsentmap | dirty
1816 */
1817 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1818 bitmap_or(unsentmap, unsentmap, bitmap,
1819 last_ram_offset() >> TARGET_PAGE_BITS);
1820
1821
1822 trace_ram_postcopy_send_discard_bitmap();
1823#ifdef DEBUG_POSTCOPY
1824 ram_debug_dump_bitmap(unsentmap, true);
1825#endif
1826
1827 ret = postcopy_each_ram_send_discard(ms);
1828 rcu_read_unlock();
1829
1830 return ret;
1831}
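
/*
 * Hedged sketch of the bitmap_or() step above (nothing beyond what that
 * call already does): the discard set sent to the destination is
 * "never sent OR dirty again", i.e. for every page index i,
 * unsentmap[i] |= bitmap[i], after which postcopy_each_ram_send_discard()
 * walks the merged map.
 */
static void example_build_discard_set(unsigned long *unsentmap,
                                      const unsigned long *dirty_bitmap,
                                      long nr_pages)
{
    /* same operation as the bitmap_or() call in the function above */
    bitmap_or(unsentmap, unsentmap, dirty_bitmap, nr_pages);
}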
1832
1833/*
1834 * At the start of the postcopy phase of migration, any now-dirty
1835 * precopied pages are discarded.
1836 *
1837 * block_name names the RAMBlock; start and length describe a byte address range within it.
1838 *
1839 * Returns 0 on success.
1840 */
1841int ram_discard_range(MigrationIncomingState *mis,
1842 const char *block_name,
1843 uint64_t start, size_t length)
1844{
1845 int ret = -1;
1846
1847 rcu_read_lock();
1848 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1849
1850 if (!rb) {
1851 error_report("ram_discard_range: Failed to find block '%s'",
1852 block_name);
1853 goto err;
1854 }
1855
1856 uint8_t *host_startaddr = rb->host + start;
1857
1858 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1859 error_report("ram_discard_range: Unaligned start address: %p",
1860 host_startaddr);
1861 goto err;
1862 }
1863
1864 if ((start + length) <= rb->used_length) {
1865 uint8_t *host_endaddr = host_startaddr + length;
1866 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1867 error_report("ram_discard_range: Unaligned end address: %p",
1868 host_endaddr);
1869 goto err;
1870 }
1871 ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1872 } else {
1873 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
9458ad6b 1874 "/%zx/" RAM_ADDR_FMT")",
e0b266f0
DDAG
1875 block_name, start, length, rb->used_length);
1876 }
1877
1878err:
1879 rcu_read_unlock();
1880
1881 return ret;
1882}
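
/*
 * Hedged helper sketch (not in the original file): both alignment checks
 * in ram_discard_range() boil down to "is this address a multiple of the
 * host page size".  qemu_host_page_size is assumed to be a power of two,
 * which is what makes the mask test below equivalent to a modulo.
 */
static bool example_is_host_page_aligned(uintptr_t addr,
                                         uintptr_t host_page_size)
{
    /* same test as the (addr & (qemu_host_page_size - 1)) checks above */
    return (addr & (host_page_size - 1)) == 0;
}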
1883
1884
56e93d26
JQ
1885/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1886 * a long-running RCU critical section.  When RCU reclaims in the code
1887 * start to become numerous, it will be necessary to reduce the
1888 * granularity of these critical sections.
1889 */
1890
1891static int ram_save_setup(QEMUFile *f, void *opaque)
1892{
1893 RAMBlock *block;
1894 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1895
56e93d26
JQ
1896 dirty_rate_high_cnt = 0;
1897 bitmap_sync_count = 0;
1898 migration_bitmap_sync_init();
dd631697 1899 qemu_mutex_init(&migration_bitmap_mutex);
56e93d26
JQ
1900
1901 if (migrate_use_xbzrle()) {
1902 XBZRLE_cache_lock();
1903 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1904 TARGET_PAGE_SIZE,
1905 TARGET_PAGE_SIZE);
1906 if (!XBZRLE.cache) {
1907 XBZRLE_cache_unlock();
1908 error_report("Error creating cache");
1909 return -1;
1910 }
1911 XBZRLE_cache_unlock();
1912
1913 /* We prefer not to abort if there is no memory */
1914 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1915 if (!XBZRLE.encoded_buf) {
1916 error_report("Error allocating encoded_buf");
1917 return -1;
1918 }
1919
1920 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1921 if (!XBZRLE.current_buf) {
1922 error_report("Error allocating current_buf");
1923 g_free(XBZRLE.encoded_buf);
1924 XBZRLE.encoded_buf = NULL;
1925 return -1;
1926 }
1927
1928 acct_clear();
1929 }
1930
49877834
PB
1931 /* For memory_global_dirty_log_start below. */
1932 qemu_mutex_lock_iothread();
1933
56e93d26
JQ
1934 qemu_mutex_lock_ramlist();
1935 rcu_read_lock();
1936 bytes_transferred = 0;
1937 reset_ram_globals();
1938
1939 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
f3f491fc 1940 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
60be6340
DL
1941 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1942 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
56e93d26 1943
f3f491fc
DDAG
1944 if (migrate_postcopy_ram()) {
1945 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1946 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1947 }
1948
56e93d26
JQ
1949 /*
1950 * Count the total number of pages used by ram blocks not including any
1951 * gaps due to alignment or unplugs.
1952 */
1953 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1954
1955 memory_global_dirty_log_start();
1956 migration_bitmap_sync();
1957 qemu_mutex_unlock_ramlist();
49877834 1958 qemu_mutex_unlock_iothread();
56e93d26
JQ
1959
1960 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1961
1962 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1963 qemu_put_byte(f, strlen(block->idstr));
1964 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1965 qemu_put_be64(f, block->used_length);
1966 }
1967
1968 rcu_read_unlock();
1969
1970 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1971 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1972
1973 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1974
1975 return 0;
1976}
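
/*
 * Hedged arithmetic sketch for the cache_init() call in ram_save_setup():
 * the cache size from migrate_xbzrle_cache_size() is in bytes and is
 * divided by the target page size to get the number of cached pages.
 * The concrete figures below are assumptions for illustration only.
 */
static int64_t example_xbzrle_cache_pages(int64_t cache_bytes,
                                          int64_t page_size)
{
    /* e.g. a 64MiB cache with 4KiB pages holds 16384 page entries */
    return cache_bytes / page_size;
}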
1977
1978static int ram_save_iterate(QEMUFile *f, void *opaque)
1979{
1980 int ret;
1981 int i;
1982 int64_t t0;
1983 int pages_sent = 0;
1984
1985 rcu_read_lock();
1986 if (ram_list.version != last_version) {
1987 reset_ram_globals();
1988 }
1989
1990 /* Read version before ram_list.blocks */
1991 smp_rmb();
1992
1993 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1994
1995 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1996 i = 0;
1997 while ((ret = qemu_file_rate_limit(f)) == 0) {
1998 int pages;
1999
2000 pages = ram_find_and_save_block(f, false, &bytes_transferred);
2001 /* no more pages to send */
2002 if (pages == 0) {
2003 break;
2004 }
2005 pages_sent += pages;
2006 acct_info.iterations++;
070afca2 2007
56e93d26
JQ
2008 /* We want to check in the 1st loop, just in case it was the 1st time
2009 and we had to sync the dirty bitmap.
2010 qemu_clock_get_ns() is a bit expensive, so we only check once every
2011 few iterations.
2012 */
2013 if ((i & 63) == 0) {
2014 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2015 if (t1 > MAX_WAIT) {
2016 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2017 t1, i);
2018 break;
2019 }
2020 }
2021 i++;
2022 }
2023 flush_compressed_data(f);
2024 rcu_read_unlock();
2025
2026 /*
2027 * Must occur before EOS (or any QEMUFile operation)
2028 * because of the RDMA protocol.
2029 */
2030 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2031
2032 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2033 bytes_transferred += 8;
2034
2035 ret = qemu_file_get_error(f);
2036 if (ret < 0) {
2037 return ret;
2038 }
2039
2040 return pages_sent;
2041}
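
/*
 * Small sketch of the throttling check in the loop above (illustration
 * only): the wall clock is read only on every 64th iteration, since
 * reading it once per page would be comparatively expensive.
 */
static bool example_should_check_clock(int iteration)
{
    /* true on iterations 0, 64, 128, ... - the (i & 63) == 0 test above */
    return (iteration & 63) == 0;
}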
2042
2043/* Called with iothread lock */
2044static int ram_save_complete(QEMUFile *f, void *opaque)
2045{
2046 rcu_read_lock();
2047
663e6c1d
DDAG
2048 if (!migration_in_postcopy(migrate_get_current())) {
2049 migration_bitmap_sync();
2050 }
56e93d26
JQ
2051
2052 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2053
2054 /* try transferring iterative blocks of memory */
2055
2056 /* flush all remaining blocks regardless of rate limiting */
2057 while (true) {
2058 int pages;
2059
2060 pages = ram_find_and_save_block(f, true, &bytes_transferred);
2061 /* no more blocks to send */
2062 if (pages == 0) {
2063 break;
2064 }
2065 }
2066
2067 flush_compressed_data(f);
2068 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2069
2070 rcu_read_unlock();
d09a6fde 2071
56e93d26
JQ
2072 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2073
2074 return 0;
2075}
2076
c31b098f
DDAG
2077static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2078 uint64_t *non_postcopiable_pending,
2079 uint64_t *postcopiable_pending)
56e93d26
JQ
2080{
2081 uint64_t remaining_size;
2082
2083 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2084
663e6c1d
DDAG
2085 if (!migration_in_postcopy(migrate_get_current()) &&
2086 remaining_size < max_size) {
56e93d26
JQ
2087 qemu_mutex_lock_iothread();
2088 rcu_read_lock();
2089 migration_bitmap_sync();
2090 rcu_read_unlock();
2091 qemu_mutex_unlock_iothread();
2092 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2093 }
c31b098f
DDAG
2094
2095 /* We can do postcopy, and all the data is postcopiable */
2096 *postcopiable_pending += remaining_size;
56e93d26
JQ
2097}
2098
2099static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2100{
2101 unsigned int xh_len;
2102 int xh_flags;
063e760a 2103 uint8_t *loaded_data;
56e93d26
JQ
2104
2105 if (!xbzrle_decoded_buf) {
2106 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2107 }
063e760a 2108 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2109
2110 /* extract RLE header */
2111 xh_flags = qemu_get_byte(f);
2112 xh_len = qemu_get_be16(f);
2113
2114 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2115 error_report("Failed to load XBZRLE page - wrong compression!");
2116 return -1;
2117 }
2118
2119 if (xh_len > TARGET_PAGE_SIZE) {
2120 error_report("Failed to load XBZRLE page - len overflow!");
2121 return -1;
2122 }
2123 /* load data and decode */
063e760a 2124 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2125
2126 /* decode RLE */
063e760a 2127 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2128 TARGET_PAGE_SIZE) == -1) {
2129 error_report("Failed to load XBZRLE page - decode error!");
2130 return -1;
2131 }
2132
2133 return 0;
2134}
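
/*
 * Hedged round-trip sketch (not part of the original file): it uses the
 * xbzrle_encode_buffer()/xbzrle_decode_buffer() helpers that the save and
 * load paths rely on.  The buffers are caller-provided and all page_size
 * bytes long; the -1/0 return-value conventions noted in the comments are
 * assumptions about those helpers.
 */
static bool example_xbzrle_roundtrip(uint8_t *old_page, uint8_t *new_page,
                                     uint8_t *encoded, uint8_t *decoded,
                                     int page_size)
{
    int elen = xbzrle_encode_buffer(old_page, new_page, page_size,
                                    encoded, page_size);
    if (elen <= 0) {
        return false;   /* -1: encoded form too large, 0: page unchanged */
    }
    /* the decoder patches a copy of the old page back into the new one */
    memcpy(decoded, old_page, page_size);
    if (xbzrle_decode_buffer(encoded, elen, decoded, page_size) == -1) {
        return false;
    }
    return memcmp(decoded, new_page, page_size) == 0;
}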
2135
2136/* Must be called from within an RCU critical section.
2137 * Returns a pointer from within the RCU-protected ram_list.
2138 */
a7180877 2139/*
4c4bad48 2140 * Read a RAMBlock ID from the stream f.
a7180877
DDAG
2141 *
2142 * f: Stream to read from
a7180877
DDAG
2143 * flags: Page flags (mostly to see if it's a continuation of the previous block)
2144 */
4c4bad48
HZ
2145static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2146 int flags)
56e93d26
JQ
2147{
2148 static RAMBlock *block = NULL;
2149 char id[256];
2150 uint8_t len;
2151
2152 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2153 if (!block) {
56e93d26
JQ
2154 error_report("Ack, bad migration stream!");
2155 return NULL;
2156 }
4c4bad48 2157 return block;
56e93d26
JQ
2158 }
2159
2160 len = qemu_get_byte(f);
2161 qemu_get_buffer(f, (uint8_t *)id, len);
2162 id[len] = 0;
2163
e3dd7493 2164 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2165 if (!block) {
2166 error_report("Can't find block %s", id);
2167 return NULL;
56e93d26
JQ
2168 }
2169
4c4bad48
HZ
2170 return block;
2171}
2172
2173static inline void *host_from_ram_block_offset(RAMBlock *block,
2174 ram_addr_t offset)
2175{
2176 if (!offset_in_ramblock(block, offset)) {
2177 return NULL;
2178 }
2179
2180 return block->host + offset;
56e93d26
JQ
2181}
2182
2183/*
2184 * If a page (or a whole RDMA chunk) has been
2185 * determined to be zero, then zap it.
2186 */
2187void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2188{
2189 if (ch != 0 || !is_zero_range(host, size)) {
2190 memset(host, ch, size);
2191 }
2192}
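
/*
 * Hedged usage sketch: a zero page arrives as RAM_SAVE_FLAG_COMPRESS with
 * ch == 0, and the memset in ram_handle_compressed() is skipped only when
 * the destination range already reads back as zero, so an untouched
 * destination page is not written to (and therefore not allocated) just
 * to refill it with zeros.
 */
static void example_receive_zero_page(uint8_t *host_page, uint64_t page_size)
{
    /* no-op if host_page is already all zeros */
    ram_handle_compressed(host_page, 0, page_size);
}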
2193
2194static void *do_data_decompress(void *opaque)
2195{
2196 DecompressParam *param = opaque;
2197 unsigned long pagesize;
2198
2199 while (!quit_decomp_thread) {
2200 qemu_mutex_lock(&param->mutex);
2201 while (!param->start && !quit_decomp_thread) {
2202 qemu_cond_wait(&param->cond, &param->mutex);
73a8912b
LL
2203 }
2204 if (!quit_decomp_thread) {
56e93d26 2205 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2206 /* uncompress() can fail in some cases, especially when the
2207 * page was dirtied while it was being compressed.  That's not
2208 * a problem, because the dirty page will be retransferred and
2209 * uncompress() won't corrupt the data in other pages.
2210 */
2211 uncompress((Bytef *)param->des, &pagesize,
2212 (const Bytef *)param->compbuf, param->len);
56e93d26 2213 }
73a8912b 2214 param->start = false;
56e93d26 2215 qemu_mutex_unlock(&param->mutex);
73a8912b
LL
2216
2217 qemu_mutex_lock(&decomp_done_lock);
2218 param->done = true;
2219 qemu_cond_signal(&decomp_done_cond);
2220 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2221 }
2222
2223 return NULL;
2224}
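
/*
 * Standalone zlib sketch (illustration only, not used by the migration
 * code): compress one page into a compressBound()-sized scratch buffer
 * and inflate it back, roughly mirroring what the compression threads on
 * the source and the decompression threads here do per page.
 */
static bool example_zlib_page_roundtrip(const uint8_t *page, uint8_t *compbuf,
                                        uint8_t *out, uLong page_size)
{
    uLongf clen = compressBound(page_size);
    uLongf dlen = page_size;

    if (compress2((Bytef *)compbuf, &clen, (const Bytef *)page, page_size,
                  Z_BEST_SPEED) != Z_OK) {
        return false;
    }
    return uncompress((Bytef *)out, &dlen, (const Bytef *)compbuf, clen) == Z_OK
           && dlen == page_size;
}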
2225
5533b2e9
LL
2226static void wait_for_decompress_done(void)
2227{
2228 int idx, thread_count;
2229
2230 if (!migrate_use_compression()) {
2231 return;
2232 }
2233
2234 thread_count = migrate_decompress_threads();
2235 qemu_mutex_lock(&decomp_done_lock);
2236 for (idx = 0; idx < thread_count; idx++) {
2237 while (!decomp_param[idx].done) {
2238 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2239 }
2240 }
2241 qemu_mutex_unlock(&decomp_done_lock);
2242}
2243
56e93d26
JQ
2244void migrate_decompress_threads_create(void)
2245{
2246 int i, thread_count;
2247
2248 thread_count = migrate_decompress_threads();
2249 decompress_threads = g_new0(QemuThread, thread_count);
2250 decomp_param = g_new0(DecompressParam, thread_count);
56e93d26 2251 quit_decomp_thread = false;
73a8912b
LL
2252 qemu_mutex_init(&decomp_done_lock);
2253 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2254 for (i = 0; i < thread_count; i++) {
2255 qemu_mutex_init(&decomp_param[i].mutex);
2256 qemu_cond_init(&decomp_param[i].cond);
2257 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2258 decomp_param[i].done = true;
56e93d26
JQ
2259 qemu_thread_create(decompress_threads + i, "decompress",
2260 do_data_decompress, decomp_param + i,
2261 QEMU_THREAD_JOINABLE);
2262 }
2263}
2264
2265void migrate_decompress_threads_join(void)
2266{
2267 int i, thread_count;
2268
2269 quit_decomp_thread = true;
2270 thread_count = migrate_decompress_threads();
2271 for (i = 0; i < thread_count; i++) {
2272 qemu_mutex_lock(&decomp_param[i].mutex);
2273 qemu_cond_signal(&decomp_param[i].cond);
2274 qemu_mutex_unlock(&decomp_param[i].mutex);
2275 }
2276 for (i = 0; i < thread_count; i++) {
2277 qemu_thread_join(decompress_threads + i);
2278 qemu_mutex_destroy(&decomp_param[i].mutex);
2279 qemu_cond_destroy(&decomp_param[i].cond);
2280 g_free(decomp_param[i].compbuf);
2281 }
2282 g_free(decompress_threads);
2283 g_free(decomp_param);
56e93d26
JQ
2284 decompress_threads = NULL;
2285 decomp_param = NULL;
56e93d26
JQ
2286}
2287
c1bc6626 2288static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2289 void *host, int len)
2290{
2291 int idx, thread_count;
2292
2293 thread_count = migrate_decompress_threads();
73a8912b 2294 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2295 while (true) {
2296 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2297 if (decomp_param[idx].done) {
c1bc6626 2298 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2299 decomp_param[idx].des = host;
2300 decomp_param[idx].len = len;
2301 start_decompression(&decomp_param[idx]);
2302 break;
2303 }
2304 }
2305 if (idx < thread_count) {
2306 break;
73a8912b
LL
2307 } else {
2308 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2309 }
2310 }
73a8912b 2311 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2312}
2313
1caddf8a
DDAG
2314/*
2315 * Allocate data structures etc. needed by incoming migration with postcopy-ram;
2316 * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2317 */
2318int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2319{
2320 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2321
2322 return postcopy_ram_incoming_init(mis, ram_pages);
2323}
2324
a7180877
DDAG
2325/*
2326 * Called in postcopy mode by ram_load().
2327 * rcu_read_lock is taken prior to this being called.
2328 */
2329static int ram_load_postcopy(QEMUFile *f)
2330{
2331 int flags = 0, ret = 0;
2332 bool place_needed = false;
2333 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2334 MigrationIncomingState *mis = migration_incoming_get_current();
2335 /* Temporary page that is later 'placed' */
2336 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2337 void *last_host = NULL;
a3b6ff6d 2338 bool all_zero = false;
a7180877
DDAG
2339
2340 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2341 ram_addr_t addr;
2342 void *host = NULL;
2343 void *page_buffer = NULL;
2344 void *place_source = NULL;
2345 uint8_t ch;
a7180877
DDAG
2346
2347 addr = qemu_get_be64(f);
2348 flags = addr & ~TARGET_PAGE_MASK;
2349 addr &= TARGET_PAGE_MASK;
2350
2351 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2352 place_needed = false;
2353 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
4c4bad48
HZ
2354 RAMBlock *block = ram_block_from_stream(f, flags);
2355
2356 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2357 if (!host) {
2358 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2359 ret = -EINVAL;
2360 break;
2361 }
a7180877
DDAG
2362 /*
2363 * Postcopy requires that we place whole host pages atomically.
2364 * To make it atomic, the data is read into a temporary page
2365 * that's moved into place later.
2366 * The migration protocol uses, possibly smaller, target pages;
2367 * however, the source ensures it always sends all the components
2368 * of a host page in order.
2369 */
2370 page_buffer = postcopy_host_page +
2371 ((uintptr_t)host & ~qemu_host_page_mask);
2372 /* If all TP are zero then we can optimise the place */
2373 if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2374 all_zero = true;
c53b7ddc
DDAG
2375 } else {
2376 /* not the 1st TP within the HP */
2377 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2378 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2379 host, last_host);
2380 ret = -EINVAL;
2381 break;
2382 }
a7180877
DDAG
2383 }
2384
c53b7ddc 2385
a7180877
DDAG
2386 /*
2387 * If it's the last part of a host page then we place the host
2388 * page
2389 */
2390 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2391 ~qemu_host_page_mask) == 0;
2392 place_source = postcopy_host_page;
2393 }
c53b7ddc 2394 last_host = host;
a7180877
DDAG
2395
2396 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2397 case RAM_SAVE_FLAG_COMPRESS:
2398 ch = qemu_get_byte(f);
2399 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2400 if (ch) {
2401 all_zero = false;
2402 }
2403 break;
2404
2405 case RAM_SAVE_FLAG_PAGE:
2406 all_zero = false;
2407 if (!place_needed || !matching_page_sizes) {
2408 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2409 } else {
2410 /* Avoids the copy out of the qemu_file buffer: postcopy is
2411 * going to copy the page into place later anyway, and we can
2412 * only do that when this read is done in one go (matching page sizes).
2413 */
2414 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2415 TARGET_PAGE_SIZE);
2416 }
2417 break;
2418 case RAM_SAVE_FLAG_EOS:
2419 /* normal exit */
2420 break;
2421 default:
2422 error_report("Unknown combination of migration flags: %#x"
2423 " (postcopy mode)", flags);
2424 ret = -EINVAL;
2425 }
2426
2427 if (place_needed) {
2428 /* This gets called at the last target page in the host page */
2429 if (all_zero) {
2430 ret = postcopy_place_page_zero(mis,
2431 host + TARGET_PAGE_SIZE -
2432 qemu_host_page_size);
2433 } else {
2434 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2435 qemu_host_page_size,
2436 place_source);
2437 }
2438 }
2439 if (!ret) {
2440 ret = qemu_file_get_error(f);
2441 }
2442 }
2443
2444 return ret;
2445}
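
/*
 * Hedged arithmetic sketch for the place_needed branch above: "host"
 * points at the last target page of a host page there, so the start of
 * the host page is host + TARGET_PAGE_SIZE - qemu_host_page_size.  The
 * sizes are parameters here purely for illustration.
 */
static uintptr_t example_host_page_start(uintptr_t last_target_page,
                                         uintptr_t target_page_size,
                                         uintptr_t host_page_size)
{
    /* e.g. with 4KiB target pages in a 2MiB host page, this steps back
     * 0x1FF000 bytes from the final target page to the host page start */
    return last_target_page + target_page_size - host_page_size;
}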
2446
56e93d26
JQ
2447static int ram_load(QEMUFile *f, void *opaque, int version_id)
2448{
2449 int flags = 0, ret = 0;
2450 static uint64_t seq_iter;
2451 int len = 0;
a7180877
DDAG
2452 /*
2453 * If the system is running in postcopy mode, page inserts into host memory must
2454 * be atomic
2455 */
2456 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
56e93d26
JQ
2457
2458 seq_iter++;
2459
2460 if (version_id != 4) {
2461 ret = -EINVAL;
2462 }
2463
2464 /* This RCU critical section can be very long running.
2465 * When RCU reclaims in the code start to become numerous,
2466 * it will be necessary to reduce the granularity of this
2467 * critical section.
2468 */
2469 rcu_read_lock();
a7180877
DDAG
2470
2471 if (postcopy_running) {
2472 ret = ram_load_postcopy(f);
2473 }
2474
2475 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2476 ram_addr_t addr, total_ram_bytes;
a776aa15 2477 void *host = NULL;
56e93d26
JQ
2478 uint8_t ch;
2479
2480 addr = qemu_get_be64(f);
2481 flags = addr & ~TARGET_PAGE_MASK;
2482 addr &= TARGET_PAGE_MASK;
2483
a776aa15
DDAG
2484 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2485 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2486 RAMBlock *block = ram_block_from_stream(f, flags);
2487
2488 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2489 if (!host) {
2490 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2491 ret = -EINVAL;
2492 break;
2493 }
2494 }
2495
56e93d26
JQ
2496 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2497 case RAM_SAVE_FLAG_MEM_SIZE:
2498 /* Synchronize RAM block list */
2499 total_ram_bytes = addr;
2500 while (!ret && total_ram_bytes) {
2501 RAMBlock *block;
56e93d26
JQ
2502 char id[256];
2503 ram_addr_t length;
2504
2505 len = qemu_get_byte(f);
2506 qemu_get_buffer(f, (uint8_t *)id, len);
2507 id[len] = 0;
2508 length = qemu_get_be64(f);
2509
e3dd7493
DDAG
2510 block = qemu_ram_block_by_name(id);
2511 if (block) {
2512 if (length != block->used_length) {
2513 Error *local_err = NULL;
56e93d26 2514
fa53a0e5 2515 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2516 &local_err);
2517 if (local_err) {
2518 error_report_err(local_err);
56e93d26 2519 }
56e93d26 2520 }
e3dd7493
DDAG
2521 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2522 block->idstr);
2523 } else {
56e93d26
JQ
2524 error_report("Unknown ramblock \"%s\", cannot "
2525 "accept migration", id);
2526 ret = -EINVAL;
2527 }
2528
2529 total_ram_bytes -= length;
2530 }
2531 break;
a776aa15 2532
56e93d26 2533 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2534 ch = qemu_get_byte(f);
2535 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2536 break;
a776aa15 2537
56e93d26 2538 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2539 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2540 break;
56e93d26 2541
a776aa15 2542 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2543 len = qemu_get_be32(f);
2544 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2545 error_report("Invalid compressed data length: %d", len);
2546 ret = -EINVAL;
2547 break;
2548 }
c1bc6626 2549 decompress_data_with_multi_threads(f, host, len);
56e93d26 2550 break;
a776aa15 2551
56e93d26 2552 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2553 if (load_xbzrle(f, addr, host) < 0) {
2554 error_report("Failed to decompress XBZRLE page at "
2555 RAM_ADDR_FMT, addr);
2556 ret = -EINVAL;
2557 break;
2558 }
2559 break;
2560 case RAM_SAVE_FLAG_EOS:
2561 /* normal exit */
2562 break;
2563 default:
2564 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2565 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2566 } else {
2567 error_report("Unknown combination of migration flags: %#x",
2568 flags);
2569 ret = -EINVAL;
2570 }
2571 }
2572 if (!ret) {
2573 ret = qemu_file_get_error(f);
2574 }
2575 }
2576
5533b2e9 2577 wait_for_decompress_done();
56e93d26
JQ
2578 rcu_read_unlock();
2579 DPRINTF("Completed load of VM with exit code %d seq iteration "
2580 "%" PRIu64 "\n", ret, seq_iter);
2581 return ret;
2582}
2583
2584static SaveVMHandlers savevm_ram_handlers = {
2585 .save_live_setup = ram_save_setup,
2586 .save_live_iterate = ram_save_iterate,
763c906b 2587 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2588 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2589 .save_live_pending = ram_save_pending,
2590 .load_state = ram_load,
6ad2a215 2591 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2592};
2593
2594void ram_mig_init(void)
2595{
2596 qemu_mutex_init(&XBZRLE.lock);
2597 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2598}