1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28#include <stdint.h>
29#include <zlib.h>
30#include "qemu/bitops.h"
31#include "qemu/bitmap.h"
32#include "qemu/timer.h"
33#include "qemu/main-loop.h"
34#include "migration/migration.h"
35#include "exec/address-spaces.h"
36#include "migration/page_cache.h"
37#include "qemu/error-report.h"
38#include "trace.h"
39#include "exec/ram_addr.h"
40#include "qemu/rcu_queue.h"
41
42#ifdef DEBUG_MIGRATION_RAM
43#define DPRINTF(fmt, ...) \
44 do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
45#else
46#define DPRINTF(fmt, ...) \
47 do { } while (0)
48#endif
49
50static int dirty_rate_high_cnt;
51
52static uint64_t bitmap_sync_count;
53
54/***********************************************************/
55/* ram save/restore */
56
57#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
58#define RAM_SAVE_FLAG_COMPRESS 0x02
59#define RAM_SAVE_FLAG_MEM_SIZE 0x04
60#define RAM_SAVE_FLAG_PAGE 0x08
61#define RAM_SAVE_FLAG_EOS 0x10
62#define RAM_SAVE_FLAG_CONTINUE 0x20
63#define RAM_SAVE_FLAG_XBZRLE 0x40
64/* 0x80 is reserved in migration.h; start with 0x100 next */
65#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
66
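/*
 * Rough sketch of a page record on the wire, as written by
 * save_page_header() and consumed by ram_load() below (not a formal
 * spec, just what this file does):
 *
 *   be64  offset within the RAMBlock, with RAM_SAVE_FLAG_* in the low bits
 *   [ u8 idstr length + idstr bytes, only if RAM_SAVE_FLAG_CONTINUE is clear ]
 *   payload selected by the flag:
 *     RAM_SAVE_FLAG_COMPRESS       -> 1 byte fill value (0 for a zero page)
 *     RAM_SAVE_FLAG_PAGE           -> TARGET_PAGE_SIZE raw bytes
 *     RAM_SAVE_FLAG_XBZRLE         -> u8 ENCODING_FLAG_XBZRLE, be16 len, data
 *     RAM_SAVE_FLAG_COMPRESS_PAGE  -> be32 len, zlib-compressed data
 */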
67static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
68
69static inline bool is_zero_range(uint8_t *p, uint64_t size)
70{
71 return buffer_find_nonzero_offset(p, size) == size;
72}
73
74/* struct contains XBZRLE cache and a static page
75 used by the compression */
76static struct {
77 /* buffer used for XBZRLE encoding */
78 uint8_t *encoded_buf;
79 /* buffer for storing page content */
80 uint8_t *current_buf;
81 /* Cache for XBZRLE, Protected by lock. */
82 PageCache *cache;
83 QemuMutex lock;
84} XBZRLE;
85
86/* buffer used for XBZRLE decoding */
87static uint8_t *xbzrle_decoded_buf;
88
89static void XBZRLE_cache_lock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_lock(&XBZRLE.lock);
93}
94
95static void XBZRLE_cache_unlock(void)
96{
97 if (migrate_use_xbzrle())
98 qemu_mutex_unlock(&XBZRLE.lock);
99}
100
101/*
102 * called from qmp_migrate_set_cache_size in main thread, possibly while
103 * a migration is in progress.
104 * A running migration may be using the cache and might finish during this
105 * call, hence changes to the cache are protected by XBZRLE.lock.
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
140
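/*
 * Note: xbzrle_cache_resize() reports pow2floor(new_size) back to the caller,
 * so the effective cache size is rounded down to a power of two; e.g. a
 * request for 600 MiB is reported as 512 MiB.
 */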
141/* accounting for migration statistics */
142typedef struct AccountingInfo {
143 uint64_t dup_pages;
144 uint64_t skipped_pages;
145 uint64_t norm_pages;
146 uint64_t iterations;
147 uint64_t xbzrle_bytes;
148 uint64_t xbzrle_pages;
149 uint64_t xbzrle_cache_miss;
150 double xbzrle_cache_miss_rate;
151 uint64_t xbzrle_overflows;
152} AccountingInfo;
153
154static AccountingInfo acct_info;
155
156static void acct_clear(void)
157{
158 memset(&acct_info, 0, sizeof(acct_info));
159}
160
161uint64_t dup_mig_bytes_transferred(void)
162{
163 return acct_info.dup_pages * TARGET_PAGE_SIZE;
164}
165
166uint64_t dup_mig_pages_transferred(void)
167{
168 return acct_info.dup_pages;
169}
170
171uint64_t skipped_mig_bytes_transferred(void)
172{
173 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
174}
175
176uint64_t skipped_mig_pages_transferred(void)
177{
178 return acct_info.skipped_pages;
179}
180
181uint64_t norm_mig_bytes_transferred(void)
182{
183 return acct_info.norm_pages * TARGET_PAGE_SIZE;
184}
185
186uint64_t norm_mig_pages_transferred(void)
187{
188 return acct_info.norm_pages;
189}
190
191uint64_t xbzrle_mig_bytes_transferred(void)
192{
193 return acct_info.xbzrle_bytes;
194}
195
196uint64_t xbzrle_mig_pages_transferred(void)
197{
198 return acct_info.xbzrle_pages;
199}
200
201uint64_t xbzrle_mig_pages_cache_miss(void)
202{
203 return acct_info.xbzrle_cache_miss;
204}
205
206double xbzrle_mig_cache_miss_rate(void)
207{
208 return acct_info.xbzrle_cache_miss_rate;
209}
210
211uint64_t xbzrle_mig_pages_overflow(void)
212{
213 return acct_info.xbzrle_overflows;
214}
215
216/* This is the last block that we have visited searching for dirty pages
217 */
218static RAMBlock *last_seen_block;
219/* This is the last block from where we have sent data */
220static RAMBlock *last_sent_block;
221static ram_addr_t last_offset;
222static QemuMutex migration_bitmap_mutex;
223static uint64_t migration_dirty_pages;
224static uint32_t last_version;
225static bool ram_bulk_stage;
226
227/* used by the search for pages to send */
228struct PageSearchStatus {
229 /* Current block being searched */
230 RAMBlock *block;
231 /* Current offset to search from */
232 ram_addr_t offset;
233 /* Set once we wrap around */
234 bool complete_round;
235};
236typedef struct PageSearchStatus PageSearchStatus;
237
238static struct BitmapRcu {
239 struct rcu_head rcu;
240 unsigned long *bmap;
241} *migration_bitmap_rcu;
242
243struct CompressParam {
244 bool start;
245 bool done;
246 QEMUFile *file;
247 QemuMutex mutex;
248 QemuCond cond;
249 RAMBlock *block;
250 ram_addr_t offset;
251};
252typedef struct CompressParam CompressParam;
253
254struct DecompressParam {
255 bool start;
256 QemuMutex mutex;
257 QemuCond cond;
258 void *des;
259 uint8_t *compbuf;
260 int len;
261};
262typedef struct DecompressParam DecompressParam;
263
264static CompressParam *comp_param;
265static QemuThread *compress_threads;
266/* comp_done_cond is used to wake up the migration thread when
267 * one of the compression threads has finished the compression.
268 * comp_done_lock is used together with comp_done_cond.
269 */
270static QemuMutex *comp_done_lock;
271static QemuCond *comp_done_cond;
272/* The empty QEMUFileOps will be used by file in CompressParam */
273static const QEMUFileOps empty_ops = { };
274
275static bool compression_switch;
276static bool quit_comp_thread;
277static bool quit_decomp_thread;
278static DecompressParam *decomp_param;
279static QemuThread *decompress_threads;
280static uint8_t *compressed_data_buf;
281
282static int do_compress_ram_page(CompressParam *param);
283
284static void *do_data_compress(void *opaque)
285{
286 CompressParam *param = opaque;
287
288 while (!quit_comp_thread) {
289 qemu_mutex_lock(&param->mutex);
290 /* Re-check quit_comp_thread in case
291 * terminate_compression_threads() was called just before
292 * qemu_mutex_lock(&param->mutex) and after
293 * while (!quit_comp_thread); re-checking it here makes
294 * sure the compression thread terminates as expected.
295 */
296 while (!param->start && !quit_comp_thread) {
297 qemu_cond_wait(&param->cond, &param->mutex);
298 }
299 if (!quit_comp_thread) {
300 do_compress_ram_page(param);
301 }
302 param->start = false;
303 qemu_mutex_unlock(&param->mutex);
304
305 qemu_mutex_lock(comp_done_lock);
306 param->done = true;
307 qemu_cond_signal(comp_done_cond);
308 qemu_mutex_unlock(comp_done_lock);
309 }
310
311 return NULL;
312}
313
314static inline void terminate_compression_threads(void)
315{
316 int idx, thread_count;
317
318 thread_count = migrate_compress_threads();
319 quit_comp_thread = true;
320 for (idx = 0; idx < thread_count; idx++) {
321 qemu_mutex_lock(&comp_param[idx].mutex);
322 qemu_cond_signal(&comp_param[idx].cond);
323 qemu_mutex_unlock(&comp_param[idx].mutex);
324 }
325}
326
327void migrate_compress_threads_join(void)
328{
329 int i, thread_count;
330
331 if (!migrate_use_compression()) {
332 return;
333 }
334 terminate_compression_threads();
335 thread_count = migrate_compress_threads();
336 for (i = 0; i < thread_count; i++) {
337 qemu_thread_join(compress_threads + i);
338 qemu_fclose(comp_param[i].file);
339 qemu_mutex_destroy(&comp_param[i].mutex);
340 qemu_cond_destroy(&comp_param[i].cond);
341 }
342 qemu_mutex_destroy(comp_done_lock);
343 qemu_cond_destroy(comp_done_cond);
344 g_free(compress_threads);
345 g_free(comp_param);
346 g_free(comp_done_cond);
347 g_free(comp_done_lock);
348 compress_threads = NULL;
349 comp_param = NULL;
350 comp_done_cond = NULL;
351 comp_done_lock = NULL;
352}
353
354void migrate_compress_threads_create(void)
355{
356 int i, thread_count;
357
358 if (!migrate_use_compression()) {
359 return;
360 }
361 quit_comp_thread = false;
362 compression_switch = true;
363 thread_count = migrate_compress_threads();
364 compress_threads = g_new0(QemuThread, thread_count);
365 comp_param = g_new0(CompressParam, thread_count);
366 comp_done_cond = g_new0(QemuCond, 1);
367 comp_done_lock = g_new0(QemuMutex, 1);
368 qemu_cond_init(comp_done_cond);
369 qemu_mutex_init(comp_done_lock);
370 for (i = 0; i < thread_count; i++) {
371 /* comp_param[i].file is just used as a dummy buffer to save data, set
372 * its ops to empty.
373 */
374 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375 comp_param[i].done = true;
376 qemu_mutex_init(&comp_param[i].mutex);
377 qemu_cond_init(&comp_param[i].cond);
378 qemu_thread_create(compress_threads + i, "compress",
379 do_data_compress, comp_param + i,
380 QEMU_THREAD_JOINABLE);
381 }
382}
383
384/**
385 * save_page_header: Write page header to wire
386 *
387 * If this is the 1st block, it also writes the block identification
388 *
389 * Returns: Number of bytes written
390 *
391 * @f: QEMUFile where to send the data
392 * @block: block that contains the page we want to send
393 * @offset: offset inside the block for the page
394 * in the lower bits, it contains flags
395 */
396static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
397{
398 size_t size, len;
399
400 qemu_put_be64(f, offset);
401 size = 8;
402
403 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
404 len = strlen(block->idstr);
405 qemu_put_byte(f, len);
406 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
407 size += 1 + len;
408 }
409 return size;
410}
411
412/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
413 * If guest dirty memory rate is reduced below the rate at which we can
414 * transfer pages to the destination then we should be able to complete
415 * migration. Some workloads dirty memory way too fast and will not effectively
416 * converge, even with auto-converge.
417 */
418static void mig_throttle_guest_down(void)
419{
420 MigrationState *s = migrate_get_current();
421 uint64_t pct_initial =
422 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
423 uint64_t pct_increment =
424 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
425
426 /* We have not started throttling yet. Let's start it. */
427 if (!cpu_throttle_active()) {
428 cpu_throttle_set(pct_initial);
429 } else {
430 /* Throttling already on, just increase the rate */
431 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
432 }
433}
434
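/*
 * For example, with MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL set to 20 and
 * MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT set to 10, successive calls to
 * mig_throttle_guest_down() throttle the guest to 20%, 30%, 40%, and so on.
 */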
435/* Update the xbzrle cache to reflect a page that's been sent as all 0.
436 * The important thing is that a stale (not-yet-0'd) page be replaced
437 * by the new data.
438 * As a bonus, if the page wasn't in the cache it gets added so that
439 * when a small write is made into the 0'd page it gets XBZRLE sent
440 */
441static void xbzrle_cache_zero_page(ram_addr_t current_addr)
442{
443 if (ram_bulk_stage || !migrate_use_xbzrle()) {
444 return;
445 }
446
447 /* We don't care if this fails to allocate a new cache page
448 * as long as it updated an old one */
449 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
450 bitmap_sync_count);
451}
452
453#define ENCODING_FLAG_XBZRLE 0x1
454
455/**
456 * save_xbzrle_page: compress and send current page
457 *
458 * Returns: 1 means that we wrote the page
459 * 0 means that page is identical to the one already sent
460 * -1 means that xbzrle would be longer than normal
461 *
462 * @f: QEMUFile where to send the data
463 * @current_data:
464 * @current_addr:
465 * @block: block that contains the page we want to send
466 * @offset: offset inside the block for the page
467 * @last_stage: if we are at the completion stage
468 * @bytes_transferred: increase it with the number of transferred bytes
469 */
470static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
471 ram_addr_t current_addr, RAMBlock *block,
472 ram_addr_t offset, bool last_stage,
473 uint64_t *bytes_transferred)
474{
475 int encoded_len = 0, bytes_xbzrle;
476 uint8_t *prev_cached_page;
477
478 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
479 acct_info.xbzrle_cache_miss++;
480 if (!last_stage) {
481 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
482 bitmap_sync_count) == -1) {
483 return -1;
484 } else {
485 /* update *current_data when the page has been
486 inserted into cache */
487 *current_data = get_cached_data(XBZRLE.cache, current_addr);
488 }
489 }
490 return -1;
491 }
492
493 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
494
495 /* save current buffer into memory */
496 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
497
498 /* XBZRLE encoding (if there is no overflow) */
499 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
500 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
501 TARGET_PAGE_SIZE);
502 if (encoded_len == 0) {
503 DPRINTF("Skipping unmodified page\n");
504 return 0;
505 } else if (encoded_len == -1) {
506 DPRINTF("Overflow\n");
507 acct_info.xbzrle_overflows++;
508 /* update data in the cache */
509 if (!last_stage) {
510 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
511 *current_data = prev_cached_page;
512 }
513 return -1;
514 }
515
516 /* we need to update the data in the cache, in order to get the same data */
517 if (!last_stage) {
518 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
519 }
520
521 /* Send XBZRLE based compressed page */
522 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
523 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
524 qemu_put_be16(f, encoded_len);
525 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
526 bytes_xbzrle += encoded_len + 1 + 2;
527 acct_info.xbzrle_pages++;
528 acct_info.xbzrle_bytes += bytes_xbzrle;
529 *bytes_transferred += bytes_xbzrle;
530
531 return 1;
532}
533
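/*
 * migration_bitmap_find_and_reset_dirty() returns the offset (relative to rb)
 * of the next dirty page at or after 'start', clearing its bit in the
 * migration bitmap; a result >= rb->used_length means no dirty page was found
 * in this block. During the bulk stage every page after the first is assumed
 * dirty, so the bitmap search is skipped.
 */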
534/* Called with rcu_read_lock() to protect migration_bitmap */
535static inline
536ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
537 ram_addr_t start)
538{
539 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
540 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
541 uint64_t rb_size = rb->used_length;
542 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
543 unsigned long *bitmap;
544
545 unsigned long next;
546
547 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
548 if (ram_bulk_stage && nr > base) {
549 next = nr + 1;
550 } else {
551 next = find_next_bit(bitmap, size, nr);
552 }
553
554 if (next < size) {
555 clear_bit(next, bitmap);
556 migration_dirty_pages--;
557 }
558 return (next - base) << TARGET_PAGE_BITS;
559}
560
561/* Called with rcu_read_lock() to protect migration_bitmap */
562static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
563{
564 unsigned long *bitmap;
565 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
566 migration_dirty_pages +=
567 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
568}
569
570/* Fix me: there are too many global variables used in migration process. */
571static int64_t start_time;
572static int64_t bytes_xfer_prev;
573static int64_t num_dirty_pages_period;
574static uint64_t xbzrle_cache_miss_prev;
575static uint64_t iterations_prev;
576
577static void migration_bitmap_sync_init(void)
578{
579 start_time = 0;
580 bytes_xfer_prev = 0;
581 num_dirty_pages_period = 0;
582 xbzrle_cache_miss_prev = 0;
583 iterations_prev = 0;
584}
585
586/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
587static void migration_bitmap_sync(void)
588{
589 RAMBlock *block;
590 uint64_t num_dirty_pages_init = migration_dirty_pages;
591 MigrationState *s = migrate_get_current();
592 int64_t end_time;
593 int64_t bytes_xfer_now;
594
595 bitmap_sync_count++;
596
597 if (!bytes_xfer_prev) {
598 bytes_xfer_prev = ram_bytes_transferred();
599 }
600
601 if (!start_time) {
602 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
603 }
604
605 trace_migration_bitmap_sync_start();
606 address_space_sync_dirty_bitmap(&address_space_memory);
607
608 qemu_mutex_lock(&migration_bitmap_mutex);
609 rcu_read_lock();
610 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
611 migration_bitmap_sync_range(block->offset, block->used_length);
612 }
613 rcu_read_unlock();
614 qemu_mutex_unlock(&migration_bitmap_mutex);
615
616 trace_migration_bitmap_sync_end(migration_dirty_pages
617 - num_dirty_pages_init);
618 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
619 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
620
621 /* more than 1 second = 1000 milliseconds */
622 if (end_time > start_time + 1000) {
623 if (migrate_auto_converge()) {
624 /* The following detection logic can be refined later. For now:
625 Check to see if the dirtied bytes are 50% more than the approx.
626 amount of bytes that just got transferred since the last time we
627 were in this routine. If that happens twice, start or increase
628 throttling */
629 bytes_xfer_now = ram_bytes_transferred();
630
631 if (s->dirty_pages_rate &&
632 (num_dirty_pages_period * TARGET_PAGE_SIZE >
633 (bytes_xfer_now - bytes_xfer_prev)/2) &&
634 (dirty_rate_high_cnt++ >= 2)) {
635 trace_migration_throttle();
636 dirty_rate_high_cnt = 0;
637 mig_throttle_guest_down();
638 }
639 bytes_xfer_prev = bytes_xfer_now;
640 }
641
642 if (migrate_use_xbzrle()) {
643 if (iterations_prev != acct_info.iterations) {
644 acct_info.xbzrle_cache_miss_rate =
645 (double)(acct_info.xbzrle_cache_miss -
646 xbzrle_cache_miss_prev) /
647 (acct_info.iterations - iterations_prev);
648 }
649 iterations_prev = acct_info.iterations;
650 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
651 }
652 s->dirty_pages_rate = num_dirty_pages_period * 1000
653 / (end_time - start_time);
654 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
655 start_time = end_time;
656 num_dirty_pages_period = 0;
657 }
658 s->dirty_sync_count = bitmap_sync_count;
659}
660
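/*
 * Worked example of the rate computed above: 50,000 pages dirtied over a
 * 1,250 ms window gives dirty_pages_rate = 50000 * 1000 / 1250 = 40,000
 * pages/s, i.e. roughly 156 MiB/s of dirty_bytes_rate assuming 4 KiB
 * target pages.
 */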
661/**
662 * save_zero_page: Send the zero page to the stream
663 *
664 * Returns: Number of pages written.
665 *
666 * @f: QEMUFile where to send the data
667 * @block: block that contains the page we want to send
668 * @offset: offset inside the block for the page
669 * @p: pointer to the page
670 * @bytes_transferred: increase it with the number of transferred bytes
671 */
672static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
673 uint8_t *p, uint64_t *bytes_transferred)
674{
675 int pages = -1;
676
677 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
678 acct_info.dup_pages++;
679 *bytes_transferred += save_page_header(f, block,
680 offset | RAM_SAVE_FLAG_COMPRESS);
681 qemu_put_byte(f, 0);
682 *bytes_transferred += 1;
683 pages = 1;
684 }
685
686 return pages;
687}
688
689/**
690 * ram_save_page: Send the given page to the stream
691 *
692 * Returns: Number of pages written.
693 *
694 * @f: QEMUFile where to send the data
695 * @block: block that contains the page we want to send
696 * @offset: offset inside the block for the page
697 * @last_stage: if we are at the completion stage
698 * @bytes_transferred: increase it with the number of transferred bytes
699 */
700static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
701 bool last_stage, uint64_t *bytes_transferred)
702{
703 int pages = -1;
704 uint64_t bytes_xmit;
705 ram_addr_t current_addr;
706 uint8_t *p;
707 int ret;
708 bool send_async = true;
709
710 p = block->host + offset;
711
712 /* When in doubt, send the page as normal */
713 bytes_xmit = 0;
714 ret = ram_control_save_page(f, block->offset,
715 offset, TARGET_PAGE_SIZE, &bytes_xmit);
716 if (bytes_xmit) {
717 *bytes_transferred += bytes_xmit;
718 pages = 1;
719 }
720
721 XBZRLE_cache_lock();
722
723 current_addr = block->offset + offset;
724
725 if (block == last_sent_block) {
726 offset |= RAM_SAVE_FLAG_CONTINUE;
727 }
728 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
729 if (ret != RAM_SAVE_CONTROL_DELAYED) {
730 if (bytes_xmit > 0) {
731 acct_info.norm_pages++;
732 } else if (bytes_xmit == 0) {
733 acct_info.dup_pages++;
734 }
735 }
736 } else {
737 pages = save_zero_page(f, block, offset, p, bytes_transferred);
738 if (pages > 0) {
739 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
740 * page would be stale
741 */
742 xbzrle_cache_zero_page(current_addr);
743 } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
744 pages = save_xbzrle_page(f, &p, current_addr, block,
745 offset, last_stage, bytes_transferred);
746 if (!last_stage) {
747 /* Can't send this cached data async, since the cache page
748 * might get updated before it gets to the wire
749 */
750 send_async = false;
751 }
752 }
753 }
754
755 /* XBZRLE overflow or normal page */
756 if (pages == -1) {
757 *bytes_transferred += save_page_header(f, block,
758 offset | RAM_SAVE_FLAG_PAGE);
759 if (send_async) {
760 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
761 } else {
762 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
763 }
764 *bytes_transferred += TARGET_PAGE_SIZE;
765 pages = 1;
766 acct_info.norm_pages++;
767 }
768
769 XBZRLE_cache_unlock();
770
771 return pages;
772}
773
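/*
 * To summarise ram_save_page(): the page is first offered to
 * ram_control_save_page() (e.g. RDMA); if that is not supported we try to
 * send it as a zero page, then as an XBZRLE delta (outside the bulk stage),
 * and finally fall back to sending the full TARGET_PAGE_SIZE buffer.
 */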
774static int do_compress_ram_page(CompressParam *param)
775{
776 int bytes_sent, blen;
777 uint8_t *p;
778 RAMBlock *block = param->block;
779 ram_addr_t offset = param->offset;
780
781 p = block->host + (offset & TARGET_PAGE_MASK);
782
783 bytes_sent = save_page_header(param->file, block, offset |
784 RAM_SAVE_FLAG_COMPRESS_PAGE);
785 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
786 migrate_compress_level());
787 bytes_sent += blen;
788
789 return bytes_sent;
790}
791
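/*
 * The compressed record produced above is a normal page header with
 * RAM_SAVE_FLAG_COMPRESS_PAGE set, followed by the stream emitted by
 * qemu_put_compression_data(); ram_load() reads it back as a be32 length
 * plus that many zlib-compressed bytes.
 */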
792static inline void start_compression(CompressParam *param)
793{
794 param->done = false;
795 qemu_mutex_lock(&param->mutex);
796 param->start = true;
797 qemu_cond_signal(&param->cond);
798 qemu_mutex_unlock(&param->mutex);
799}
800
801static inline void start_decompression(DecompressParam *param)
802{
803 qemu_mutex_lock(&param->mutex);
804 param->start = true;
805 qemu_cond_signal(&param->cond);
806 qemu_mutex_unlock(&param->mutex);
807}
808
809static uint64_t bytes_transferred;
810
811static void flush_compressed_data(QEMUFile *f)
812{
813 int idx, len, thread_count;
814
815 if (!migrate_use_compression()) {
816 return;
817 }
818 thread_count = migrate_compress_threads();
819 for (idx = 0; idx < thread_count; idx++) {
820 if (!comp_param[idx].done) {
821 qemu_mutex_lock(comp_done_lock);
822 while (!comp_param[idx].done && !quit_comp_thread) {
823 qemu_cond_wait(comp_done_cond, comp_done_lock);
824 }
825 qemu_mutex_unlock(comp_done_lock);
826 }
827 if (!quit_comp_thread) {
828 len = qemu_put_qemu_file(f, comp_param[idx].file);
829 bytes_transferred += len;
830 }
831 }
832}
833
834static inline void set_compress_params(CompressParam *param, RAMBlock *block,
835 ram_addr_t offset)
836{
837 param->block = block;
838 param->offset = offset;
839}
840
841static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
842 ram_addr_t offset,
843 uint64_t *bytes_transferred)
844{
845 int idx, thread_count, bytes_xmit = -1, pages = -1;
846
847 thread_count = migrate_compress_threads();
848 qemu_mutex_lock(comp_done_lock);
849 while (true) {
850 for (idx = 0; idx < thread_count; idx++) {
851 if (comp_param[idx].done) {
852 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
853 set_compress_params(&comp_param[idx], block, offset);
854 start_compression(&comp_param[idx]);
855 pages = 1;
856 acct_info.norm_pages++;
857 *bytes_transferred += bytes_xmit;
858 break;
859 }
860 }
861 if (pages > 0) {
862 break;
863 } else {
864 qemu_cond_wait(comp_done_cond, comp_done_lock);
865 }
866 }
867 qemu_mutex_unlock(comp_done_lock);
868
869 return pages;
870}
871
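/*
 * Hand-off between the migration thread and the compression workers: the
 * migration thread waits on comp_done_cond until some worker reports done,
 * flushes that worker's buffered QEMUFile into the real stream with
 * qemu_put_qemu_file(), and then gives the worker the next (block, offset)
 * via set_compress_params()/start_compression().
 */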
872/**
873 * ram_save_compressed_page: compress the given page and send it to the stream
874 *
875 * Returns: Number of pages written.
876 *
877 * @f: QEMUFile where to send the data
878 * @block: block that contains the page we want to send
879 * @offset: offset inside the block for the page
880 * @last_stage: if we are at the completion stage
881 * @bytes_transferred: increase it with the number of transferred bytes
882 */
883static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
884 ram_addr_t offset, bool last_stage,
885 uint64_t *bytes_transferred)
886{
887 int pages = -1;
888 uint64_t bytes_xmit;
889 uint8_t *p;
890 int ret;
891
892 p = block->host + offset;
893
894 bytes_xmit = 0;
895 ret = ram_control_save_page(f, block->offset,
896 offset, TARGET_PAGE_SIZE, &bytes_xmit);
897 if (bytes_xmit) {
898 *bytes_transferred += bytes_xmit;
899 pages = 1;
900 }
901 if (block == last_sent_block) {
902 offset |= RAM_SAVE_FLAG_CONTINUE;
903 }
904 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
905 if (ret != RAM_SAVE_CONTROL_DELAYED) {
906 if (bytes_xmit > 0) {
907 acct_info.norm_pages++;
908 } else if (bytes_xmit == 0) {
909 acct_info.dup_pages++;
910 }
911 }
912 } else {
913 /* When starting the process of a new block, the first page of
914 * the block should be sent out before other pages in the same
915 * block, and all the pages in the last block should have been sent
916 * out. Keeping this order is important, because the 'cont' flag
917 * is used to avoid resending the block name.
918 */
919 if (block != last_sent_block) {
920 flush_compressed_data(f);
921 pages = save_zero_page(f, block, offset, p, bytes_transferred);
922 if (pages == -1) {
923 set_compress_params(&comp_param[0], block, offset);
924 /* Use the qemu thread to compress the data to make sure the
925 * first page is sent out before other pages
926 */
927 bytes_xmit = do_compress_ram_page(&comp_param[0]);
928 acct_info.norm_pages++;
929 qemu_put_qemu_file(f, comp_param[0].file);
930 *bytes_transferred += bytes_xmit;
931 pages = 1;
932 }
933 } else {
934 pages = save_zero_page(f, block, offset, p, bytes_transferred);
935 if (pages == -1) {
936 pages = compress_page_with_multi_thread(f, block, offset,
937 bytes_transferred);
938 }
939 }
940 }
941
942 return pages;
943}
944
945/*
946 * Find the next dirty page and update any state associated with
947 * the search process.
948 *
949 * Returns: True if a page is found
950 *
951 * @f: Current migration stream.
952 * @pss: Data about the state of the current dirty page scan.
953 * @*again: Set to false if the search has scanned the whole of RAM
954 */
955static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
956 bool *again)
957{
958 pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
959 pss->offset);
960 if (pss->complete_round && pss->block == last_seen_block &&
961 pss->offset >= last_offset) {
962 /*
963 * We've been once around the RAM and haven't found anything.
964 * Give up.
965 */
966 *again = false;
967 return false;
968 }
969 if (pss->offset >= pss->block->used_length) {
970 /* Didn't find anything in this RAM Block */
971 pss->offset = 0;
972 pss->block = QLIST_NEXT_RCU(pss->block, next);
973 if (!pss->block) {
974 /* Hit the end of the list */
975 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
976 /* Flag that we've looped */
977 pss->complete_round = true;
978 ram_bulk_stage = false;
979 if (migrate_use_xbzrle()) {
980 /* If xbzrle is on, stop using the data compression at this
981 * point. In theory, xbzrle can do better than compression.
982 */
983 flush_compressed_data(f);
984 compression_switch = false;
985 }
986 }
987 /* Didn't find anything this time, but try again on the new block */
988 *again = true;
989 return false;
990 } else {
991 /* Can go around again, but... */
992 *again = true;
993 /* We've found something so probably don't need to */
994 return true;
995 }
996}
997
998/**
999 * ram_find_and_save_block: Finds a dirty page and sends it to f
1000 *
1001 * Called within an RCU critical section.
1002 *
1003 * Returns: The number of pages written
1004 * 0 means no dirty pages
1005 *
1006 * @f: QEMUFile where to send the data
1007 * @last_stage: if we are at the completion stage
1008 * @bytes_transferred: increase it with the number of transferred bytes
1009 */
1010
1011static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1012 uint64_t *bytes_transferred)
1013{
1014 PageSearchStatus pss;
1015 int pages = 0;
1016 bool again, found;
1017
1018 pss.block = last_seen_block;
1019 pss.offset = last_offset;
1020 pss.complete_round = false;
1021
1022 if (!pss.block) {
1023 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1024 }
1025
1026 do {
1027 found = find_dirty_block(f, &pss, &again);
1028
1029 if (found) {
1030 if (compression_switch && migrate_use_compression()) {
1031 pages = ram_save_compressed_page(f, pss.block, pss.offset,
1032 last_stage,
1033 bytes_transferred);
1034 } else {
1035 pages = ram_save_page(f, pss.block, pss.offset, last_stage,
1036 bytes_transferred);
1037 }
1038
1039 /* if page is unmodified, continue to the next */
1040 if (pages > 0) {
1041 last_sent_block = pss.block;
1042 }
1043 }
1044 } while (!pages && again);
1045
1046 last_seen_block = pss.block;
1047 last_offset = pss.offset;
1048
1049 return pages;
1050}
1051
1052void acct_update_position(QEMUFile *f, size_t size, bool zero)
1053{
1054 uint64_t pages = size / TARGET_PAGE_SIZE;
1055 if (zero) {
1056 acct_info.dup_pages += pages;
1057 } else {
1058 acct_info.norm_pages += pages;
1059 bytes_transferred += size;
1060 qemu_update_position(f, size);
1061 }
1062}
1063
1064static ram_addr_t ram_save_remaining(void)
1065{
1066 return migration_dirty_pages;
1067}
1068
1069uint64_t ram_bytes_remaining(void)
1070{
1071 return ram_save_remaining() * TARGET_PAGE_SIZE;
1072}
1073
1074uint64_t ram_bytes_transferred(void)
1075{
1076 return bytes_transferred;
1077}
1078
1079uint64_t ram_bytes_total(void)
1080{
1081 RAMBlock *block;
1082 uint64_t total = 0;
1083
1084 rcu_read_lock();
1085 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1086 total += block->used_length;
1087 rcu_read_unlock();
1088 return total;
1089}
1090
1091void free_xbzrle_decoded_buf(void)
1092{
1093 g_free(xbzrle_decoded_buf);
1094 xbzrle_decoded_buf = NULL;
1095}
1096
1097static void migration_bitmap_free(struct BitmapRcu *bmap)
1098{
1099 g_free(bmap->bmap);
1100 g_free(bmap);
1101}
1102
1103static void ram_migration_cleanup(void *opaque)
1104{
1105 /* The caller must hold the iothread lock or be in a bh, so there is
1106 * no writing race against this migration_bitmap
1107 */
1108 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1109 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1110 if (bitmap) {
1111 memory_global_dirty_log_stop();
1112 call_rcu(bitmap, migration_bitmap_free, rcu);
1113 }
1114
1115 XBZRLE_cache_lock();
1116 if (XBZRLE.cache) {
1117 cache_fini(XBZRLE.cache);
1118 g_free(XBZRLE.encoded_buf);
1119 g_free(XBZRLE.current_buf);
1120 XBZRLE.cache = NULL;
1121 XBZRLE.encoded_buf = NULL;
1122 XBZRLE.current_buf = NULL;
1123 }
1124 XBZRLE_cache_unlock();
1125}
1126
1127static void reset_ram_globals(void)
1128{
1129 last_seen_block = NULL;
1130 last_sent_block = NULL;
1131 last_offset = 0;
1132 last_version = ram_list.version;
1133 ram_bulk_stage = true;
1134}
1135
1136#define MAX_WAIT 50 /* ms, half buffered_file limit */
1137
1138void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1139{
1140 /* called in qemu main thread, so there is
1141 * no writing race against this migration_bitmap
1142 */
1143 if (migration_bitmap_rcu) {
1144 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1145 bitmap = g_new(struct BitmapRcu, 1);
1146 bitmap->bmap = bitmap_new(new);
1147
1148 /* Prevent bits in the migration_bitmap from being set
1149 * by migration_bitmap_sync_range() at the same time.
1150 * It is safe for migration if a migration_bitmap bit is cleared
1151 * at the same time.
1152 */
1153 qemu_mutex_lock(&migration_bitmap_mutex);
1154 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1155 bitmap_set(bitmap->bmap, old, new - old);
1156 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1157 qemu_mutex_unlock(&migration_bitmap_mutex);
1158 migration_dirty_pages += new - old;
1159 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1160 }
1161}
1162
1163/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1164 * long-running RCU critical section. When rcu-reclaims in the code
1165 * start to become numerous it will be necessary to reduce the
1166 * granularity of these critical sections.
1167 */
1168
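/*
 * ram_save_setup: first stage of RAM migration. It sets up the XBZRLE
 * cache and buffers (if enabled), allocates the migration bitmap sized to
 * last_ram_offset() with every bit set, starts dirty logging and does an
 * initial bitmap sync, then writes the total RAM size followed by the
 * (idstr, used_length) pair of every RAMBlock and a RAM_SAVE_FLAG_EOS marker.
 */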
1169static int ram_save_setup(QEMUFile *f, void *opaque)
1170{
1171 RAMBlock *block;
1172 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1173
1174 dirty_rate_high_cnt = 0;
1175 bitmap_sync_count = 0;
1176 migration_bitmap_sync_init();
1177 qemu_mutex_init(&migration_bitmap_mutex);
1178
1179 if (migrate_use_xbzrle()) {
1180 XBZRLE_cache_lock();
1181 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1182 TARGET_PAGE_SIZE,
1183 TARGET_PAGE_SIZE);
1184 if (!XBZRLE.cache) {
1185 XBZRLE_cache_unlock();
1186 error_report("Error creating cache");
1187 return -1;
1188 }
1189 XBZRLE_cache_unlock();
1190
1191 /* We prefer not to abort if there is no memory */
1192 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1193 if (!XBZRLE.encoded_buf) {
1194 error_report("Error allocating encoded_buf");
1195 return -1;
1196 }
1197
1198 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1199 if (!XBZRLE.current_buf) {
1200 error_report("Error allocating current_buf");
1201 g_free(XBZRLE.encoded_buf);
1202 XBZRLE.encoded_buf = NULL;
1203 return -1;
1204 }
1205
1206 acct_clear();
1207 }
1208
1209 /* iothread lock needed for ram_list.dirty_memory[] */
1210 qemu_mutex_lock_iothread();
1211 qemu_mutex_lock_ramlist();
1212 rcu_read_lock();
1213 bytes_transferred = 0;
1214 reset_ram_globals();
1215
1216 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1217 migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
1218 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1219 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1220
1221 /*
1222 * Count the total number of pages used by ram blocks not including any
1223 * gaps due to alignment or unplugs.
1224 */
1225 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1226
1227 memory_global_dirty_log_start();
1228 migration_bitmap_sync();
1229 qemu_mutex_unlock_ramlist();
1230 qemu_mutex_unlock_iothread();
1231
1232 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1233
1234 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1235 qemu_put_byte(f, strlen(block->idstr));
1236 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1237 qemu_put_be64(f, block->used_length);
1238 }
1239
1240 rcu_read_unlock();
1241
1242 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1243 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1244
1245 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1246
1247 return 0;
1248}
1249
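/*
 * ram_save_iterate: send dirty pages until the rate limit in the QEMUFile
 * kicks in or no dirty pages are left. The clock is only consulted every
 * 64 iterations (the "i & 63" check) because qemu_clock_get_ns() is
 * relatively expensive, and the loop bails out after MAX_WAIT ms.
 */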
1250static int ram_save_iterate(QEMUFile *f, void *opaque)
1251{
1252 int ret;
1253 int i;
1254 int64_t t0;
1255 int pages_sent = 0;
1256
1257 rcu_read_lock();
1258 if (ram_list.version != last_version) {
1259 reset_ram_globals();
1260 }
1261
1262 /* Read version before ram_list.blocks */
1263 smp_rmb();
1264
1265 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1266
1267 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1268 i = 0;
1269 while ((ret = qemu_file_rate_limit(f)) == 0) {
1270 int pages;
1271
1272 pages = ram_find_and_save_block(f, false, &bytes_transferred);
1273 /* no more pages to send */
1274 if (pages == 0) {
1275 break;
1276 }
1277 pages_sent += pages;
1278 acct_info.iterations++;
1279
1280 /* we want to check in the 1st loop, just in case it was the 1st time
1281 and we had to sync the dirty bitmap.
1282 qemu_clock_get_ns() is a bit expensive, so we only check every few
1283 iterations
1284 */
1285 if ((i & 63) == 0) {
1286 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1287 if (t1 > MAX_WAIT) {
1288 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
1289 t1, i);
1290 break;
1291 }
1292 }
1293 i++;
1294 }
1295 flush_compressed_data(f);
1296 rcu_read_unlock();
1297
1298 /*
1299 * Must occur before EOS (or any QEMUFile operation)
1300 * because of RDMA protocol.
1301 */
1302 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1303
1304 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1305 bytes_transferred += 8;
1306
1307 ret = qemu_file_get_error(f);
1308 if (ret < 0) {
1309 return ret;
1310 }
1311
1312 return pages_sent;
1313}
1314
1315/* Called with iothread lock */
1316static int ram_save_complete(QEMUFile *f, void *opaque)
1317{
1318 rcu_read_lock();
1319
1320 migration_bitmap_sync();
1321
1322 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
1323
1324 /* try transferring iterative blocks of memory */
1325
1326 /* flush all remaining blocks regardless of rate limiting */
1327 while (true) {
1328 int pages;
1329
1330 pages = ram_find_and_save_block(f, true, &bytes_transferred);
1331 /* no more blocks to send */
1332 if (pages == 0) {
1333 break;
1334 }
1335 }
1336
1337 flush_compressed_data(f);
1338 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
1339
1340 rcu_read_unlock();
1341
1342 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1343
1344 return 0;
1345}
1346
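/*
 * ram_save_pending: report how many bytes are still dirty. If the estimate
 * is already below max_size, re-sync the dirty bitmap (under the iothread
 * lock) so the caller decides on completion with fresh data.
 */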
1347static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
1348{
1349 uint64_t remaining_size;
1350
1351 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1352
1353 if (remaining_size < max_size) {
1354 qemu_mutex_lock_iothread();
1355 rcu_read_lock();
1356 migration_bitmap_sync();
1357 rcu_read_unlock();
1358 qemu_mutex_unlock_iothread();
1359 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1360 }
1361 return remaining_size;
1362}
1363
1364static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
1365{
1366 unsigned int xh_len;
1367 int xh_flags;
1368
1369 if (!xbzrle_decoded_buf) {
1370 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1371 }
1372
1373 /* extract RLE header */
1374 xh_flags = qemu_get_byte(f);
1375 xh_len = qemu_get_be16(f);
1376
1377 if (xh_flags != ENCODING_FLAG_XBZRLE) {
1378 error_report("Failed to load XBZRLE page - wrong compression!");
1379 return -1;
1380 }
1381
1382 if (xh_len > TARGET_PAGE_SIZE) {
1383 error_report("Failed to load XBZRLE page - len overflow!");
1384 return -1;
1385 }
1386 /* load data and decode */
1387 qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
1388
1389 /* decode RLE */
1390 if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
1391 TARGET_PAGE_SIZE) == -1) {
1392 error_report("Failed to load XBZRLE page - decode error!");
1393 return -1;
1394 }
1395
1396 return 0;
1397}
1398
1399/* Must be called from within a rcu critical section.
1400 * Returns a pointer from within the RCU-protected ram_list.
1401 */
1402static inline void *host_from_stream_offset(QEMUFile *f,
1403 ram_addr_t offset,
1404 int flags)
1405{
1406 static RAMBlock *block = NULL;
1407 char id[256];
1408 uint8_t len;
1409
1410 if (flags & RAM_SAVE_FLAG_CONTINUE) {
1411 if (!block || block->max_length <= offset) {
1412 error_report("Ack, bad migration stream!");
1413 return NULL;
1414 }
1415
1416 return block->host + offset;
1417 }
1418
1419 len = qemu_get_byte(f);
1420 qemu_get_buffer(f, (uint8_t *)id, len);
1421 id[len] = 0;
1422
1423 block = qemu_ram_block_by_name(id);
1424 if (block && block->max_length > offset) {
1425 return block->host + offset;
1426 }
1427
1428 error_report("Can't find block %s", id);
1429 return NULL;
1430}
1431
1432/*
1433 * If a page (or a whole RDMA chunk) has been
1434 * determined to be zero, then zap it.
1435 */
1436void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
1437{
1438 if (ch != 0 || !is_zero_range(host, size)) {
1439 memset(host, ch, size);
1440 }
1441}
1442
1443static void *do_data_decompress(void *opaque)
1444{
1445 DecompressParam *param = opaque;
1446 unsigned long pagesize;
1447
1448 while (!quit_decomp_thread) {
1449 qemu_mutex_lock(&param->mutex);
1450 while (!param->start && !quit_decomp_thread) {
1451 qemu_cond_wait(&param->cond, &param->mutex);
1452 pagesize = TARGET_PAGE_SIZE;
1453 if (!quit_decomp_thread) {
1454 /* uncompress() will fail in some cases, especially
1455 * when the page is dirtied while it is being compressed; that is
1456 * not a problem because the dirty page will be retransferred
1457 * and uncompress() won't break the data in other pages.
1458 */
1459 uncompress((Bytef *)param->des, &pagesize,
1460 (const Bytef *)param->compbuf, param->len);
1461 }
1462 param->start = false;
1463 }
1464 qemu_mutex_unlock(&param->mutex);
1465 }
1466
1467 return NULL;
1468}
1469
1470void migrate_decompress_threads_create(void)
1471{
1472 int i, thread_count;
1473
1474 thread_count = migrate_decompress_threads();
1475 decompress_threads = g_new0(QemuThread, thread_count);
1476 decomp_param = g_new0(DecompressParam, thread_count);
1477 compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1478 quit_decomp_thread = false;
1479 for (i = 0; i < thread_count; i++) {
1480 qemu_mutex_init(&decomp_param[i].mutex);
1481 qemu_cond_init(&decomp_param[i].cond);
1482 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1483 qemu_thread_create(decompress_threads + i, "decompress",
1484 do_data_decompress, decomp_param + i,
1485 QEMU_THREAD_JOINABLE);
1486 }
1487}
1488
1489void migrate_decompress_threads_join(void)
1490{
1491 int i, thread_count;
1492
1493 quit_decomp_thread = true;
1494 thread_count = migrate_decompress_threads();
1495 for (i = 0; i < thread_count; i++) {
1496 qemu_mutex_lock(&decomp_param[i].mutex);
1497 qemu_cond_signal(&decomp_param[i].cond);
1498 qemu_mutex_unlock(&decomp_param[i].mutex);
1499 }
1500 for (i = 0; i < thread_count; i++) {
1501 qemu_thread_join(decompress_threads + i);
1502 qemu_mutex_destroy(&decomp_param[i].mutex);
1503 qemu_cond_destroy(&decomp_param[i].cond);
1504 g_free(decomp_param[i].compbuf);
1505 }
1506 g_free(decompress_threads);
1507 g_free(decomp_param);
1508 g_free(compressed_data_buf);
1509 decompress_threads = NULL;
1510 decomp_param = NULL;
1511 compressed_data_buf = NULL;
1512}
1513
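/*
 * decompress_data_with_multi_threads: busy-poll for a decompression thread
 * that is not currently started, copy the compressed data into its private
 * compbuf and kick it with start_decompression(). Note there is no blocking
 * wait here; the loop simply retries until a free thread is found.
 */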
1514static void decompress_data_with_multi_threads(uint8_t *compbuf,
1515 void *host, int len)
1516{
1517 int idx, thread_count;
1518
1519 thread_count = migrate_decompress_threads();
1520 while (true) {
1521 for (idx = 0; idx < thread_count; idx++) {
1522 if (!decomp_param[idx].start) {
1523 memcpy(decomp_param[idx].compbuf, compbuf, len);
1524 decomp_param[idx].des = host;
1525 decomp_param[idx].len = len;
1526 start_decompression(&decomp_param[idx]);
1527 break;
1528 }
1529 }
1530 if (idx < thread_count) {
1531 break;
1532 }
1533 }
1534}
1535
1536static int ram_load(QEMUFile *f, void *opaque, int version_id)
1537{
1538 int flags = 0, ret = 0;
1539 static uint64_t seq_iter;
1540 int len = 0;
1541
1542 seq_iter++;
1543
1544 if (version_id != 4) {
1545 ret = -EINVAL;
1546 }
1547
1548 /* This RCU critical section can be very long running.
1549 * When RCU reclaims in the code start to become numerous,
1550 * it will be necessary to reduce the granularity of this
1551 * critical section.
1552 */
1553 rcu_read_lock();
1554 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
1555 ram_addr_t addr, total_ram_bytes;
1556 void *host;
1557 uint8_t ch;
1558
1559 addr = qemu_get_be64(f);
1560 flags = addr & ~TARGET_PAGE_MASK;
1561 addr &= TARGET_PAGE_MASK;
1562
1563 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
1564 case RAM_SAVE_FLAG_MEM_SIZE:
1565 /* Synchronize RAM block list */
1566 total_ram_bytes = addr;
1567 while (!ret && total_ram_bytes) {
1568 RAMBlock *block;
1569 char id[256];
1570 ram_addr_t length;
1571
1572 len = qemu_get_byte(f);
1573 qemu_get_buffer(f, (uint8_t *)id, len);
1574 id[len] = 0;
1575 length = qemu_get_be64(f);
1576
1577 block = qemu_ram_block_by_name(id);
1578 if (block) {
1579 if (length != block->used_length) {
1580 Error *local_err = NULL;
1581
1582 ret = qemu_ram_resize(block->offset, length,
1583 &local_err);
1584 if (local_err) {
1585 error_report_err(local_err);
1586 }
1587 }
1588 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
1589 block->idstr);
1590 } else {
1591 error_report("Unknown ramblock \"%s\", cannot "
1592 "accept migration", id);
1593 ret = -EINVAL;
1594 }
1595
1596 total_ram_bytes -= length;
1597 }
1598 break;
1599 case RAM_SAVE_FLAG_COMPRESS:
1600 host = host_from_stream_offset(f, addr, flags);
1601 if (!host) {
1602 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1603 ret = -EINVAL;
1604 break;
1605 }
1606 ch = qemu_get_byte(f);
1607 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
1608 break;
1609 case RAM_SAVE_FLAG_PAGE:
1610 host = host_from_stream_offset(f, addr, flags);
1611 if (!host) {
1612 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1613 ret = -EINVAL;
1614 break;
1615 }
1616 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
1617 break;
1618 case RAM_SAVE_FLAG_COMPRESS_PAGE:
1619 host = host_from_stream_offset(f, addr, flags);
1620 if (!host) {
1621 error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
1622 ret = -EINVAL;
1623 break;
1624 }
1625
1626 len = qemu_get_be32(f);
1627 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
1628 error_report("Invalid compressed data length: %d", len);
1629 ret = -EINVAL;
1630 break;
1631 }
1632 qemu_get_buffer(f, compressed_data_buf, len);
1633 decompress_data_with_multi_threads(compressed_data_buf, host, len);
1634 break;
1635 case RAM_SAVE_FLAG_XBZRLE:
1636 host = host_from_stream_offset(f, addr, flags);
1637 if (!host) {
1638 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1639 ret = -EINVAL;
1640 break;
1641 }
1642 if (load_xbzrle(f, addr, host) < 0) {
1643 error_report("Failed to decompress XBZRLE page at "
1644 RAM_ADDR_FMT, addr);
1645 ret = -EINVAL;
1646 break;
1647 }
1648 break;
1649 case RAM_SAVE_FLAG_EOS:
1650 /* normal exit */
1651 break;
1652 default:
1653 if (flags & RAM_SAVE_FLAG_HOOK) {
1654 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
1655 } else {
1656 error_report("Unknown combination of migration flags: %#x",
1657 flags);
1658 ret = -EINVAL;
1659 }
1660 }
1661 if (!ret) {
1662 ret = qemu_file_get_error(f);
1663 }
1664 }
1665
1666 rcu_read_unlock();
1667 DPRINTF("Completed load of VM with exit code %d seq iteration "
1668 "%" PRIu64 "\n", ret, seq_iter);
1669 return ret;
1670}
1671
1672static SaveVMHandlers savevm_ram_handlers = {
1673 .save_live_setup = ram_save_setup,
1674 .save_live_iterate = ram_save_iterate,
1675 .save_live_complete = ram_save_complete,
1676 .save_live_pending = ram_save_pending,
1677 .load_state = ram_load,
1678 .cleanup = ram_migration_cleanup,
1679};
1680
1681void ram_mig_init(void)
1682{
1683 qemu_mutex_init(&XBZRLE.lock);
1684 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
1685}