ram: Remember last_page instead of last_offset
[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h; start the next flag at 0x100 */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
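/*
 * Illustrative sketch (added note, not part of the original file): each page
 * record on the wire starts with a be64 word carrying the page offset in its
 * upper bits and one or more of the flags above in the low bits. A normal
 * page within an already-announced block is, in effect, sent as:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *     qemu_put_buffer(f, page, TARGET_PAGE_SIZE);
 *
 * The exact sequence is what save_page_header()/ram_save_page() below emit;
 * this only shows how offset and flags share the same 64-bit field.
 */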
60
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* struct containing the XBZRLE cache and a static page
69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by the XBZRLE.lock mutex.
102 *
103 * Returns the new_size or negative in case of error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
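/*
 * Worked example (added note): the cache is sized in whole target pages and
 * the effective size is rounded down to a power of two, so with 4 KiB target
 * pages a request of, say, 70 MiB returns pow2floor(70 MiB) = 64 MiB, while
 * asking for less than one target page fails with -1.
 */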
140
eb859c53
JQ
141struct RAMBitmap {
142 struct rcu_head rcu;
143 /* Main migration bitmap */
144 unsigned long *bmap;
145 /* bitmap of pages that haven't been sent even once;
146 * only maintained and used in postcopy at the moment,
147 * where it's used to send the dirtymap at the start
148 * of the postcopy phase
149 */
150 unsigned long *unsentmap;
151};
152typedef struct RAMBitmap RAMBitmap;
153
ec481c6c
JQ
154/*
155 * An outstanding page request, on the source, having been received
156 * and queued
157 */
158struct RAMSrcPageRequest {
159 RAMBlock *rb;
160 hwaddr offset;
161 hwaddr len;
162
163 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
164};
165
6f37bb8b
JQ
166/* State of RAM for migration */
167struct RAMState {
204b88b8
JQ
168 /* QEMUFile used for this migration */
169 QEMUFile *f;
6f37bb8b
JQ
170 /* Last block that we have visited searching for dirty pages */
171 RAMBlock *last_seen_block;
172 /* Last block from where we have sent data */
173 RAMBlock *last_sent_block;
269ace29
JQ
174 /* Last dirty target page we have sent */
175 ram_addr_t last_page;
6f37bb8b
JQ
176 /* last ram version we have seen */
177 uint32_t last_version;
178 /* We are in the first round */
179 bool ram_bulk_stage;
8d820d6f
JQ
180 /* How many times we have dirty too many pages */
181 int dirty_rate_high_cnt;
5a987738
JQ
182 /* How many times we have synchronized the bitmap */
183 uint64_t bitmap_sync_count;
f664da80
JQ
184 /* these variables are used for bitmap sync */
185 /* last time we did a full bitmap_sync */
186 int64_t time_last_bitmap_sync;
eac74159 187 /* bytes transferred at start_time */
c4bdf0cf 188 uint64_t bytes_xfer_prev;
a66cd90c 189 /* number of dirty pages since start_time */
68908ed6 190 uint64_t num_dirty_pages_period;
b5833fde
JQ
191 /* xbzrle misses since the beginning of the period */
192 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
193 /* number of iterations at the beginning of period */
194 uint64_t iterations_prev;
f7ccd61b
JQ
195 /* Accounting fields */
196 /* number of zero pages; it used to count pages filled with the same char */
197 uint64_t zero_pages;
b4d1c6e7
JQ
198 /* number of normal transferred pages */
199 uint64_t norm_pages;
23b28c3c
JQ
200 /* Iterations since start */
201 uint64_t iterations;
f36ada95
JQ
202 /* xbzrle transmitted bytes. Note that these are compressed,
203 * so they can't be calculated from the page count */
07ed50a2 204 uint64_t xbzrle_bytes;
f36ada95
JQ
205 /* xbzrle transmitted pages */
206 uint64_t xbzrle_pages;
544c36f1
JQ
207 /* number of xbzrle cache misses */
208 uint64_t xbzrle_cache_miss;
b07016b6
JQ
209 /* xbzrle miss rate */
210 double xbzrle_cache_miss_rate;
180f61f7
JQ
211 /* xbzrle number of overflows */
212 uint64_t xbzrle_overflows;
0d8ec885
JQ
213 /* number of dirty bits in the bitmap */
214 uint64_t migration_dirty_pages;
2f4fde93
JQ
215 /* total number of bytes transferred */
216 uint64_t bytes_transferred;
47ad8619
JQ
217 /* number of dirtied pages in the last second */
218 uint64_t dirty_pages_rate;
96506894
JQ
219 /* Count of requests incoming from destination */
220 uint64_t postcopy_requests;
108cfae0
JQ
221 /* protects modification of the bitmap */
222 QemuMutex bitmap_mutex;
eb859c53
JQ
223 /* Ram Bitmap protected by RCU */
224 RAMBitmap *ram_bitmap;
68a098f3
JQ
225 /* The RAMBlock used in the last src_page_requests */
226 RAMBlock *last_req_rb;
ec481c6c
JQ
227 /* Queue of outstanding page requests from the destination */
228 QemuMutex src_page_req_mutex;
229 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
230};
231typedef struct RAMState RAMState;
232
233static RAMState ram_state;
234
56e93d26
JQ
235uint64_t dup_mig_pages_transferred(void)
236{
f7ccd61b 237 return ram_state.zero_pages;
56e93d26
JQ
238}
239
56e93d26
JQ
240uint64_t norm_mig_pages_transferred(void)
241{
b4d1c6e7 242 return ram_state.norm_pages;
56e93d26
JQ
243}
244
245uint64_t xbzrle_mig_bytes_transferred(void)
246{
07ed50a2 247 return ram_state.xbzrle_bytes;
56e93d26
JQ
248}
249
250uint64_t xbzrle_mig_pages_transferred(void)
251{
f36ada95 252 return ram_state.xbzrle_pages;
56e93d26
JQ
253}
254
255uint64_t xbzrle_mig_pages_cache_miss(void)
256{
544c36f1 257 return ram_state.xbzrle_cache_miss;
56e93d26
JQ
258}
259
260double xbzrle_mig_cache_miss_rate(void)
261{
b07016b6 262 return ram_state.xbzrle_cache_miss_rate;
56e93d26
JQ
263}
264
265uint64_t xbzrle_mig_pages_overflow(void)
266{
180f61f7 267 return ram_state.xbzrle_overflows;
56e93d26
JQ
268}
269
9edabd4d 270uint64_t ram_bytes_transferred(void)
0d8ec885 271{
9edabd4d 272 return ram_state.bytes_transferred;
0d8ec885
JQ
273}
274
9edabd4d 275uint64_t ram_bytes_remaining(void)
2f4fde93 276{
9edabd4d 277 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
278}
279
42d219d3
JQ
280uint64_t ram_dirty_sync_count(void)
281{
282 return ram_state.bitmap_sync_count;
283}
284
47ad8619
JQ
285uint64_t ram_dirty_pages_rate(void)
286{
287 return ram_state.dirty_pages_rate;
288}
289
96506894
JQ
290uint64_t ram_postcopy_requests(void)
291{
292 return ram_state.postcopy_requests;
293}
294
b8fb8cb7
DDAG
295/* used by the search for pages to send */
296struct PageSearchStatus {
297 /* Current block being searched */
298 RAMBlock *block;
299 /* Current offset to search from */
300 ram_addr_t offset;
301 /* Set once we wrap around */
302 bool complete_round;
303};
304typedef struct PageSearchStatus PageSearchStatus;
305
56e93d26 306struct CompressParam {
56e93d26 307 bool done;
90e56fb4 308 bool quit;
56e93d26
JQ
309 QEMUFile *file;
310 QemuMutex mutex;
311 QemuCond cond;
312 RAMBlock *block;
313 ram_addr_t offset;
314};
315typedef struct CompressParam CompressParam;
316
317struct DecompressParam {
73a8912b 318 bool done;
90e56fb4 319 bool quit;
56e93d26
JQ
320 QemuMutex mutex;
321 QemuCond cond;
322 void *des;
d341d9f3 323 uint8_t *compbuf;
56e93d26
JQ
324 int len;
325};
326typedef struct DecompressParam DecompressParam;
327
328static CompressParam *comp_param;
329static QemuThread *compress_threads;
330/* comp_done_cond is used to wake up the migration thread when
331 * one of the compression threads has finished the compression.
332 * comp_done_lock is used together with comp_done_cond.
333 */
0d9f9a5c
LL
334static QemuMutex comp_done_lock;
335static QemuCond comp_done_cond;
56e93d26
JQ
336/* The empty QEMUFileOps will be used by file in CompressParam */
337static const QEMUFileOps empty_ops = { };
338
56e93d26
JQ
339static DecompressParam *decomp_param;
340static QemuThread *decompress_threads;
73a8912b
LL
341static QemuMutex decomp_done_lock;
342static QemuCond decomp_done_cond;
56e93d26 343
a7a9a88f
LL
344static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
345 ram_addr_t offset);
56e93d26
JQ
346
347static void *do_data_compress(void *opaque)
348{
349 CompressParam *param = opaque;
a7a9a88f
LL
350 RAMBlock *block;
351 ram_addr_t offset;
56e93d26 352
a7a9a88f 353 qemu_mutex_lock(&param->mutex);
90e56fb4 354 while (!param->quit) {
a7a9a88f
LL
355 if (param->block) {
356 block = param->block;
357 offset = param->offset;
358 param->block = NULL;
359 qemu_mutex_unlock(&param->mutex);
360
361 do_compress_ram_page(param->file, block, offset);
362
0d9f9a5c 363 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 364 param->done = true;
0d9f9a5c
LL
365 qemu_cond_signal(&comp_done_cond);
366 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
367
368 qemu_mutex_lock(&param->mutex);
369 } else {
56e93d26
JQ
370 qemu_cond_wait(&param->cond, &param->mutex);
371 }
56e93d26 372 }
a7a9a88f 373 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
374
375 return NULL;
376}
377
378static inline void terminate_compression_threads(void)
379{
380 int idx, thread_count;
381
382 thread_count = migrate_compress_threads();
3d0684b2 383
56e93d26
JQ
384 for (idx = 0; idx < thread_count; idx++) {
385 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 386 comp_param[idx].quit = true;
56e93d26
JQ
387 qemu_cond_signal(&comp_param[idx].cond);
388 qemu_mutex_unlock(&comp_param[idx].mutex);
389 }
390}
391
392void migrate_compress_threads_join(void)
393{
394 int i, thread_count;
395
396 if (!migrate_use_compression()) {
397 return;
398 }
399 terminate_compression_threads();
400 thread_count = migrate_compress_threads();
401 for (i = 0; i < thread_count; i++) {
402 qemu_thread_join(compress_threads + i);
403 qemu_fclose(comp_param[i].file);
404 qemu_mutex_destroy(&comp_param[i].mutex);
405 qemu_cond_destroy(&comp_param[i].cond);
406 }
0d9f9a5c
LL
407 qemu_mutex_destroy(&comp_done_lock);
408 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
409 g_free(compress_threads);
410 g_free(comp_param);
56e93d26
JQ
411 compress_threads = NULL;
412 comp_param = NULL;
56e93d26
JQ
413}
414
415void migrate_compress_threads_create(void)
416{
417 int i, thread_count;
418
419 if (!migrate_use_compression()) {
420 return;
421 }
56e93d26
JQ
422 thread_count = migrate_compress_threads();
423 compress_threads = g_new0(QemuThread, thread_count);
424 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
425 qemu_cond_init(&comp_done_cond);
426 qemu_mutex_init(&comp_done_lock);
56e93d26 427 for (i = 0; i < thread_count; i++) {
e110aa91
C
428 /* comp_param[i].file is just used as a dummy buffer to save data,
429 * set its ops to empty.
56e93d26
JQ
430 */
431 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
432 comp_param[i].done = true;
90e56fb4 433 comp_param[i].quit = false;
56e93d26
JQ
434 qemu_mutex_init(&comp_param[i].mutex);
435 qemu_cond_init(&comp_param[i].cond);
436 qemu_thread_create(compress_threads + i, "compress",
437 do_data_compress, comp_param + i,
438 QEMU_THREAD_JOINABLE);
439 }
440}
441
442/**
3d0684b2 443 * save_page_header: write page header to wire
56e93d26
JQ
444 *
445 * If this is the 1st block, it also writes the block identification
446 *
3d0684b2 447 * Returns the number of bytes written
56e93d26
JQ
448 *
449 * @f: QEMUFile where to send the data
450 * @block: block that contains the page we want to send
451 * @offset: offset inside the block for the page
452 * in the lower bits, it contains flags
453 */
24795694 454static size_t save_page_header(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 455{
9f5f380b 456 size_t size, len;
56e93d26 457
24795694
JQ
458 if (block == rs->last_sent_block) {
459 offset |= RAM_SAVE_FLAG_CONTINUE;
460 }
461 qemu_put_be64(rs->f, offset);
56e93d26
JQ
462 size = 8;
463
464 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 465 len = strlen(block->idstr);
24795694
JQ
466 qemu_put_byte(rs->f, len);
467 qemu_put_buffer(rs->f, (uint8_t *)block->idstr, len);
9f5f380b 468 size += 1 + len;
24795694 469 rs->last_sent_block = block;
56e93d26
JQ
470 }
471 return size;
472}
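/*
 * Added sketch of the header layout produced above (first page of a block
 * vs. a continuation within the same block):
 *
 *   first page of a block:  [be64: offset|flags][u8: strlen(idstr)][idstr]
 *   continuation page:      [be64: offset|flags|RAM_SAVE_FLAG_CONTINUE]
 *
 * which is why the returned size is 8 bytes plus, when the block name is
 * included, 1 + strlen(idstr).
 */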
473
3d0684b2
JQ
474/**
475 * mig_throttle_guest_down: throttle down the guest
476 *
477 * Reduce amount of guest cpu execution to hopefully slow down memory
478 * writes. If guest dirty memory rate is reduced below the rate at
479 * which we can transfer pages to the destination then we should be
480 * able to complete migration. Some workloads dirty memory way too
481 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
482 */
483static void mig_throttle_guest_down(void)
484{
485 MigrationState *s = migrate_get_current();
2594f56d
DB
486 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
487 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
488
489 /* We have not started throttling yet. Let's start it. */
490 if (!cpu_throttle_active()) {
491 cpu_throttle_set(pct_initial);
492 } else {
493 /* Throttling already on, just increase the rate */
494 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
495 }
496}
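/*
 * Added note: throttling therefore ramps up in steps. Assuming the defaults
 * of this era (cpu_throttle_initial = 20, cpu_throttle_increment = 10 -- the
 * real values come from MigrationState parameters and may differ), repeated
 * calls set the throttle to 20%, 30%, 40%, ... of guest CPU time.
 */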
497
3d0684b2
JQ
498/**
499 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
500 *
6f37bb8b 501 * @rs: current RAM state
3d0684b2
JQ
502 * @current_addr: address for the zero page
503 *
504 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
505 * The important thing is that a stale (not-yet-0'd) page be replaced
506 * by the new data.
507 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 508 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 509 */
6f37bb8b 510static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 511{
6f37bb8b 512 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
513 return;
514 }
515
516 /* We don't care if this fails to allocate a new cache page
517 * as long as it updated an old one */
518 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 519 rs->bitmap_sync_count);
56e93d26
JQ
520}
521
522#define ENCODING_FLAG_XBZRLE 0x1
523
524/**
525 * save_xbzrle_page: compress and send current page
526 *
527 * Returns: 1 means that we wrote the page
528 * 0 means that the page is identical to the one already sent
529 * -1 means that xbzrle would be longer than normal
530 *
5a987738 531 * @rs: current RAM state
3d0684b2
JQ
532 * @current_data: pointer to the address of the page contents
533 * @current_addr: addr of the page
56e93d26
JQ
534 * @block: block that contains the page we want to send
535 * @offset: offset inside the block for the page
536 * @last_stage: if we are at the completion stage
56e93d26 537 */
204b88b8 538static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 539 ram_addr_t current_addr, RAMBlock *block,
072c2511 540 ram_addr_t offset, bool last_stage)
56e93d26
JQ
541{
542 int encoded_len = 0, bytes_xbzrle;
543 uint8_t *prev_cached_page;
544
5a987738 545 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
544c36f1 546 rs->xbzrle_cache_miss++;
56e93d26
JQ
547 if (!last_stage) {
548 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 549 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
550 return -1;
551 } else {
552 /* update *current_data when the page has been
553 inserted into cache */
554 *current_data = get_cached_data(XBZRLE.cache, current_addr);
555 }
556 }
557 return -1;
558 }
559
560 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
561
562 /* save current buffer into memory */
563 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
564
565 /* XBZRLE encoding (if there is no overflow) */
566 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
567 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
568 TARGET_PAGE_SIZE);
569 if (encoded_len == 0) {
55c4446b 570 trace_save_xbzrle_page_skipping();
56e93d26
JQ
571 return 0;
572 } else if (encoded_len == -1) {
55c4446b 573 trace_save_xbzrle_page_overflow();
180f61f7 574 rs->xbzrle_overflows++;
56e93d26
JQ
575 /* update data in the cache */
576 if (!last_stage) {
577 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
578 *current_data = prev_cached_page;
579 }
580 return -1;
581 }
582
583 /* we need to update the data in the cache, in order to get the same data */
584 if (!last_stage) {
585 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
586 }
587
588 /* Send XBZRLE based compressed page */
24795694 589 bytes_xbzrle = save_page_header(rs, block,
204b88b8
JQ
590 offset | RAM_SAVE_FLAG_XBZRLE);
591 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
592 qemu_put_be16(rs->f, encoded_len);
593 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 594 bytes_xbzrle += encoded_len + 1 + 2;
f36ada95 595 rs->xbzrle_pages++;
07ed50a2 596 rs->xbzrle_bytes += bytes_xbzrle;
072c2511 597 rs->bytes_transferred += bytes_xbzrle;
56e93d26
JQ
598
599 return 1;
600}
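/*
 * Added sketch of the XBZRLE record emitted above:
 *
 *   [page header][u8: ENCODING_FLAG_XBZRLE][be16: encoded_len][encoded bytes]
 *
 * hence the "encoded_len + 1 + 2" added to bytes_xbzrle on top of the
 * header size returned by save_page_header().
 */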
601
3d0684b2
JQ
602/**
603 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 604 *
3d0684b2
JQ
605 * Called with rcu_read_lock() to protect migration_bitmap
606 *
607 * Returns the byte offset within memory region of the start of a dirty page
608 *
6f37bb8b 609 * @rs: current RAM state
3d0684b2
JQ
610 * @rb: RAMBlock where to search for dirty pages
611 * @start: starting address (typically so we can continue from previous page)
06b10688 612 * @page_abs: pointer into where to store the dirty page
f3f491fc 613 */
56e93d26 614static inline
6f37bb8b 615ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b 616 ram_addr_t start,
06b10688 617 unsigned long *page_abs)
56e93d26 618{
2f68e399 619 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 620 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
621 uint64_t rb_size = rb->used_length;
622 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 623 unsigned long *bitmap;
56e93d26
JQ
624
625 unsigned long next;
626
eb859c53 627 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
6f37bb8b 628 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
629 next = nr + 1;
630 } else {
2ff64038 631 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
632 }
633
06b10688 634 *page_abs = next;
56e93d26
JQ
635 return (next - base) << TARGET_PAGE_BITS;
636}
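/*
 * Added example of the index arithmetic above, assuming 4 KiB target pages:
 * for a RAMBlock at offset 0x40000000 and start 0x2000, base = 0x40000 and
 * nr = 0x40002; the bit found at index 'next' is converted back to a
 * block-relative byte offset with (next - base) << TARGET_PAGE_BITS.
 */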
637
06b10688
JQ
638static inline bool migration_bitmap_clear_dirty(RAMState *rs,
639 unsigned long page_abs)
a82d593b
DDAG
640{
641 bool ret;
eb859c53 642 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
a82d593b 643
06b10688 644 ret = test_and_clear_bit(page_abs, bitmap);
a82d593b
DDAG
645
646 if (ret) {
0d8ec885 647 rs->migration_dirty_pages--;
a82d593b
DDAG
648 }
649 return ret;
650}
651
15440dd5
JQ
652static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
653 ram_addr_t start, ram_addr_t length)
56e93d26 654{
2ff64038 655 unsigned long *bitmap;
eb859c53 656 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
0d8ec885 657 rs->migration_dirty_pages +=
15440dd5 658 cpu_physical_memory_sync_dirty_bitmap(bitmap, rb, start, length,
0d8ec885 659 &rs->num_dirty_pages_period);
56e93d26
JQ
660}
661
3d0684b2
JQ
662/**
663 * ram_pagesize_summary: calculate all the pagesizes of a VM
664 *
665 * Returns a summary bitmap of the page sizes of all RAMBlocks
666 *
667 * For VMs with just normal pages this is equivalent to the host page
668 * size. If it's got some huge pages then it's the OR of all the
669 * different page sizes.
e8ca1db2
DDAG
670 */
671uint64_t ram_pagesize_summary(void)
672{
673 RAMBlock *block;
674 uint64_t summary = 0;
675
676 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
677 summary |= block->page_size;
678 }
679
680 return summary;
681}
682
8d820d6f 683static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
684{
685 RAMBlock *block;
56e93d26 686 int64_t end_time;
c4bdf0cf 687 uint64_t bytes_xfer_now;
56e93d26 688
5a987738 689 rs->bitmap_sync_count++;
56e93d26 690
eac74159
JQ
691 if (!rs->bytes_xfer_prev) {
692 rs->bytes_xfer_prev = ram_bytes_transferred();
56e93d26
JQ
693 }
694
f664da80
JQ
695 if (!rs->time_last_bitmap_sync) {
696 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
697 }
698
699 trace_migration_bitmap_sync_start();
9c1f8f44 700 memory_global_dirty_log_sync();
56e93d26 701
108cfae0 702 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26
JQ
703 rcu_read_lock();
704 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
15440dd5 705 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
706 }
707 rcu_read_unlock();
108cfae0 708 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 709
a66cd90c 710 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 711
56e93d26
JQ
712 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
713
714 /* more than 1 second = 1000 milliseconds */
f664da80 715 if (end_time > rs->time_last_bitmap_sync + 1000) {
56e93d26
JQ
716 if (migrate_auto_converge()) {
717 /* The following detection logic can be refined later. For now:
718 Check to see if the dirtied bytes are 50% more than the approx.
719 amount of bytes that just got transferred since the last time we
070afca2
JH
720 were in this routine. If that happens twice, start or increase
721 throttling */
56e93d26 722 bytes_xfer_now = ram_bytes_transferred();
070afca2 723
47ad8619 724 if (rs->dirty_pages_rate &&
a66cd90c 725 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 726 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
8d820d6f 727 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 728 trace_migration_throttle();
8d820d6f 729 rs->dirty_rate_high_cnt = 0;
070afca2 730 mig_throttle_guest_down();
56e93d26 731 }
eac74159 732 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 733 }
070afca2 734
56e93d26 735 if (migrate_use_xbzrle()) {
23b28c3c 736 if (rs->iterations_prev != rs->iterations) {
b07016b6 737 rs->xbzrle_cache_miss_rate =
544c36f1 738 (double)(rs->xbzrle_cache_miss -
b5833fde 739 rs->xbzrle_cache_miss_prev) /
23b28c3c 740 (rs->iterations - rs->iterations_prev);
56e93d26 741 }
23b28c3c 742 rs->iterations_prev = rs->iterations;
544c36f1 743 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
56e93d26 744 }
47ad8619 745 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
f664da80 746 / (end_time - rs->time_last_bitmap_sync);
f664da80 747 rs->time_last_bitmap_sync = end_time;
a66cd90c 748 rs->num_dirty_pages_period = 0;
56e93d26 749 }
4addcd4f 750 if (migrate_use_events()) {
5a987738 751 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 752 }
56e93d26
JQ
753}
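/*
 * Added worked example of the auto-converge check above: if, during one
 * ~1s period, 300 MB worth of pages were dirtied but only 100 MB were
 * transferred, the dirtied bytes exceed half of the transferred bytes;
 * once that condition has held in two periods
 * (rs->dirty_rate_high_cnt++ >= 2) the guest is throttled via
 * mig_throttle_guest_down().
 */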
754
755/**
3d0684b2 756 * save_zero_page: send the zero page to the stream
56e93d26 757 *
3d0684b2 758 * Returns the number of pages written.
56e93d26 759 *
f7ccd61b 760 * @rs: current RAM state
56e93d26
JQ
761 * @block: block that contains the page we want to send
762 * @offset: offset inside the block for the page
763 * @p: pointer to the page
56e93d26 764 */
ce25d337
JQ
765static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
766 uint8_t *p)
56e93d26
JQ
767{
768 int pages = -1;
769
770 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
f7ccd61b 771 rs->zero_pages++;
072c2511 772 rs->bytes_transferred +=
24795694 773 save_page_header(rs, block, offset | RAM_SAVE_FLAG_COMPRESS);
ce25d337 774 qemu_put_byte(rs->f, 0);
072c2511 775 rs->bytes_transferred += 1;
56e93d26
JQ
776 pages = 1;
777 }
778
779 return pages;
780}
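/*
 * Added note: a zero page therefore costs only the page header plus a
 * single 0x00 fill byte on the wire (sent under RAM_SAVE_FLAG_COMPRESS),
 * rather than TARGET_PAGE_SIZE bytes of data.
 */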
781
5727309d 782static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 783{
5727309d 784 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
785 return;
786 }
787
aaa2064c 788 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
789}
790
56e93d26 791/**
3d0684b2 792 * ram_save_page: send the given page to the stream
56e93d26 793 *
3d0684b2 794 * Returns the number of pages written.
3fd3c4b3
DDAG
795 * < 0 - error
796 * >=0 - Number of pages written - this might legally be 0
797 * if xbzrle noticed the page was the same.
56e93d26 798 *
6f37bb8b 799 * @rs: current RAM state
56e93d26
JQ
800 * @block: block that contains the page we want to send
801 * @offset: offset inside the block for the page
802 * @last_stage: if we are at the completion stage
56e93d26 803 */
a0a8aa14 804static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
805{
806 int pages = -1;
807 uint64_t bytes_xmit;
808 ram_addr_t current_addr;
56e93d26
JQ
809 uint8_t *p;
810 int ret;
811 bool send_async = true;
a08f6890
HZ
812 RAMBlock *block = pss->block;
813 ram_addr_t offset = pss->offset;
56e93d26 814
2f68e399 815 p = block->host + offset;
56e93d26
JQ
816
817 /* In doubt sent page as normal */
818 bytes_xmit = 0;
ce25d337 819 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
820 offset, TARGET_PAGE_SIZE, &bytes_xmit);
821 if (bytes_xmit) {
072c2511 822 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
823 pages = 1;
824 }
825
826 XBZRLE_cache_lock();
827
828 current_addr = block->offset + offset;
829
56e93d26
JQ
830 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
831 if (ret != RAM_SAVE_CONTROL_DELAYED) {
832 if (bytes_xmit > 0) {
b4d1c6e7 833 rs->norm_pages++;
56e93d26 834 } else if (bytes_xmit == 0) {
f7ccd61b 835 rs->zero_pages++;
56e93d26
JQ
836 }
837 }
838 } else {
ce25d337 839 pages = save_zero_page(rs, block, offset, p);
56e93d26
JQ
840 if (pages > 0) {
841 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
842 * page would be stale
843 */
6f37bb8b 844 xbzrle_cache_zero_page(rs, current_addr);
5727309d 845 ram_release_pages(block->idstr, pss->offset, pages);
6f37bb8b 846 } else if (!rs->ram_bulk_stage &&
5727309d 847 !migration_in_postcopy() && migrate_use_xbzrle()) {
204b88b8 848 pages = save_xbzrle_page(rs, &p, current_addr, block,
072c2511 849 offset, last_stage);
56e93d26
JQ
850 if (!last_stage) {
851 /* Can't send this cached data async, since the cache page
852 * might get updated before it gets to the wire
853 */
854 send_async = false;
855 }
856 }
857 }
858
859 /* XBZRLE overflow or normal page */
860 if (pages == -1) {
24795694
JQ
861 rs->bytes_transferred += save_page_header(rs, block,
862 offset | RAM_SAVE_FLAG_PAGE);
56e93d26 863 if (send_async) {
ce25d337 864 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
53f09a10 865 migrate_release_ram() &
5727309d 866 migration_in_postcopy());
56e93d26 867 } else {
ce25d337 868 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
56e93d26 869 }
072c2511 870 rs->bytes_transferred += TARGET_PAGE_SIZE;
56e93d26 871 pages = 1;
b4d1c6e7 872 rs->norm_pages++;
56e93d26
JQ
873 }
874
875 XBZRLE_cache_unlock();
876
877 return pages;
878}
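/*
 * Added summary of the decision order in ram_save_page() above: first try
 * ram_control_save_page() (e.g. an RDMA transport may take the page), then
 * check for an all-zero page, then XBZRLE (only after the bulk stage and
 * outside postcopy), and finally fall back to sending the raw
 * TARGET_PAGE_SIZE bytes with RAM_SAVE_FLAG_PAGE.
 */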
879
a7a9a88f
LL
880static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
881 ram_addr_t offset)
56e93d26 882{
24795694 883 RAMState *rs = &ram_state;
56e93d26 884 int bytes_sent, blen;
a7a9a88f 885 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 886
24795694 887 bytes_sent = save_page_header(rs, block, offset |
56e93d26 888 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 889 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 890 migrate_compress_level());
b3be2896
LL
891 if (blen < 0) {
892 bytes_sent = 0;
893 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
894 error_report("compressed data failed!");
895 } else {
896 bytes_sent += blen;
5727309d 897 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 898 }
56e93d26
JQ
899
900 return bytes_sent;
901}
902
ce25d337 903static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
904{
905 int idx, len, thread_count;
906
907 if (!migrate_use_compression()) {
908 return;
909 }
910 thread_count = migrate_compress_threads();
a7a9a88f 911
0d9f9a5c 912 qemu_mutex_lock(&comp_done_lock);
56e93d26 913 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 914 while (!comp_param[idx].done) {
0d9f9a5c 915 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 916 }
a7a9a88f 917 }
0d9f9a5c 918 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
919
920 for (idx = 0; idx < thread_count; idx++) {
921 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 922 if (!comp_param[idx].quit) {
ce25d337 923 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2f4fde93 924 rs->bytes_transferred += len;
56e93d26 925 }
a7a9a88f 926 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
927 }
928}
929
930static inline void set_compress_params(CompressParam *param, RAMBlock *block,
931 ram_addr_t offset)
932{
933 param->block = block;
934 param->offset = offset;
935}
936
ce25d337
JQ
937static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
938 ram_addr_t offset)
56e93d26
JQ
939{
940 int idx, thread_count, bytes_xmit = -1, pages = -1;
941
942 thread_count = migrate_compress_threads();
0d9f9a5c 943 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
944 while (true) {
945 for (idx = 0; idx < thread_count; idx++) {
946 if (comp_param[idx].done) {
a7a9a88f 947 comp_param[idx].done = false;
ce25d337 948 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 949 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 950 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
951 qemu_cond_signal(&comp_param[idx].cond);
952 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 953 pages = 1;
b4d1c6e7 954 rs->norm_pages++;
072c2511 955 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
956 break;
957 }
958 }
959 if (pages > 0) {
960 break;
961 } else {
0d9f9a5c 962 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
963 }
964 }
0d9f9a5c 965 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
966
967 return pages;
968}
969
970/**
971 * ram_save_compressed_page: compress the given page and send it to the stream
972 *
3d0684b2 973 * Returns the number of pages written.
56e93d26 974 *
6f37bb8b 975 * @rs: current RAM state
56e93d26
JQ
976 * @block: block that contains the page we want to send
977 * @offset: offset inside the block for the page
978 * @last_stage: if we are at the completion stage
56e93d26 979 */
a0a8aa14
JQ
980static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
981 bool last_stage)
56e93d26
JQ
982{
983 int pages = -1;
fc50438e 984 uint64_t bytes_xmit = 0;
56e93d26 985 uint8_t *p;
fc50438e 986 int ret, blen;
a08f6890
HZ
987 RAMBlock *block = pss->block;
988 ram_addr_t offset = pss->offset;
56e93d26 989
2f68e399 990 p = block->host + offset;
56e93d26 991
ce25d337 992 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
993 offset, TARGET_PAGE_SIZE, &bytes_xmit);
994 if (bytes_xmit) {
072c2511 995 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
996 pages = 1;
997 }
56e93d26
JQ
998 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
999 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1000 if (bytes_xmit > 0) {
b4d1c6e7 1001 rs->norm_pages++;
56e93d26 1002 } else if (bytes_xmit == 0) {
f7ccd61b 1003 rs->zero_pages++;
56e93d26
JQ
1004 }
1005 }
1006 } else {
1007 /* When starting the process of a new block, the first page of
1008 * the block should be sent out before other pages in the same
1009 * block, and all the pages in the last block should have been sent
1010 * out. Keeping this order is important, because the 'cont' flag
1011 * is used to avoid resending the block name.
1012 */
6f37bb8b 1013 if (block != rs->last_sent_block) {
ce25d337
JQ
1014 flush_compressed_data(rs);
1015 pages = save_zero_page(rs, block, offset, p);
56e93d26 1016 if (pages == -1) {
fc50438e 1017 /* Make sure the first page is sent out before other pages */
24795694 1018 bytes_xmit = save_page_header(rs, block, offset |
fc50438e 1019 RAM_SAVE_FLAG_COMPRESS_PAGE);
ce25d337 1020 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
fc50438e
LL
1021 migrate_compress_level());
1022 if (blen > 0) {
072c2511 1023 rs->bytes_transferred += bytes_xmit + blen;
b4d1c6e7 1024 rs->norm_pages++;
b3be2896 1025 pages = 1;
fc50438e 1026 } else {
ce25d337 1027 qemu_file_set_error(rs->f, blen);
fc50438e 1028 error_report("compressed data failed!");
b3be2896 1029 }
56e93d26 1030 }
53f09a10 1031 if (pages > 0) {
5727309d 1032 ram_release_pages(block->idstr, pss->offset, pages);
53f09a10 1033 }
56e93d26 1034 } else {
ce25d337 1035 pages = save_zero_page(rs, block, offset, p);
56e93d26 1036 if (pages == -1) {
ce25d337 1037 pages = compress_page_with_multi_thread(rs, block, offset);
53f09a10 1038 } else {
5727309d 1039 ram_release_pages(block->idstr, pss->offset, pages);
56e93d26
JQ
1040 }
1041 }
1042 }
1043
1044 return pages;
1045}
1046
3d0684b2
JQ
1047/**
1048 * find_dirty_block: find the next dirty page and update any state
1049 * associated with the search process.
b9e60928 1050 *
3d0684b2 1051 * Returns if a page is found
b9e60928 1052 *
6f37bb8b 1053 * @rs: current RAM state
3d0684b2
JQ
1054 * @pss: data about the state of the current dirty page scan
1055 * @again: set to false if the search has scanned the whole of RAM
06b10688 1056 * @page_abs: pointer into where to store the dirty page
b9e60928 1057 */
ce25d337 1058static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss,
06b10688 1059 bool *again, unsigned long *page_abs)
b9e60928 1060{
6f37bb8b 1061 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
06b10688 1062 page_abs);
6f37bb8b 1063 if (pss->complete_round && pss->block == rs->last_seen_block &&
269ace29 1064 (pss->offset >> TARGET_PAGE_BITS) >= rs->last_page) {
b9e60928
DDAG
1065 /*
1066 * We've been once around the RAM and haven't found anything.
1067 * Give up.
1068 */
1069 *again = false;
1070 return false;
1071 }
1072 if (pss->offset >= pss->block->used_length) {
1073 /* Didn't find anything in this RAM Block */
1074 pss->offset = 0;
1075 pss->block = QLIST_NEXT_RCU(pss->block, next);
1076 if (!pss->block) {
1077 /* Hit the end of the list */
1078 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1079 /* Flag that we've looped */
1080 pss->complete_round = true;
6f37bb8b 1081 rs->ram_bulk_stage = false;
b9e60928
DDAG
1082 if (migrate_use_xbzrle()) {
1083 /* If xbzrle is on, stop using the data compression at this
1084 * point. In theory, xbzrle can do better than compression.
1085 */
ce25d337 1086 flush_compressed_data(rs);
b9e60928
DDAG
1087 }
1088 }
1089 /* Didn't find anything this time, but try again on the new block */
1090 *again = true;
1091 return false;
1092 } else {
1093 /* Can go around again, but... */
1094 *again = true;
1095 /* We've found something so probably don't need to */
1096 return true;
1097 }
1098}
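/*
 * Added note on the return/again contract above: (found == true) means pss
 * now points at a dirty page; (found == false, *again == true) means the
 * current block was exhausted and the search moved on, so the caller should
 * retry; (found == false, *again == false) means a full pass over RAM found
 * nothing and the caller's loop terminates.
 */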
1099
3d0684b2
JQ
1100/**
1101 * unqueue_page: gets a page of the queue
1102 *
a82d593b 1103 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1104 *
3d0684b2
JQ
1105 * Returns the block of the page (or NULL if none available)
1106 *
ec481c6c 1107 * @rs: current RAM state
3d0684b2 1108 * @offset: used to return the offset within the RAMBlock
06b10688 1109 * @page_abs: pointer into where to store the dirty page
a82d593b 1110 */
ec481c6c 1111static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset,
06b10688 1112 unsigned long *page_abs)
a82d593b
DDAG
1113{
1114 RAMBlock *block = NULL;
1115
ec481c6c
JQ
1116 qemu_mutex_lock(&rs->src_page_req_mutex);
1117 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1118 struct RAMSrcPageRequest *entry =
1119 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1120 block = entry->rb;
1121 *offset = entry->offset;
06b10688 1122 *page_abs = (entry->offset + entry->rb->offset) >> TARGET_PAGE_BITS;
a82d593b
DDAG
1123
1124 if (entry->len > TARGET_PAGE_SIZE) {
1125 entry->len -= TARGET_PAGE_SIZE;
1126 entry->offset += TARGET_PAGE_SIZE;
1127 } else {
1128 memory_region_unref(block->mr);
ec481c6c 1129 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1130 g_free(entry);
1131 }
1132 }
ec481c6c 1133 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1134
1135 return block;
1136}
1137
3d0684b2
JQ
1138/**
1139 * get_queued_page: unqueue a page from the postcopy requests
1140 *
1141 * Skips pages that are already sent (!dirty)
a82d593b 1142 *
3d0684b2 1143 * Returns if a queued page is found
a82d593b 1144 *
6f37bb8b 1145 * @rs: current RAM state
3d0684b2 1146 * @pss: data about the state of the current dirty page scan
06b10688 1147 * @page_abs: pointer into where to store the dirty page
a82d593b 1148 */
ec481c6c 1149static bool get_queued_page(RAMState *rs, PageSearchStatus *pss,
06b10688 1150 unsigned long *page_abs)
a82d593b
DDAG
1151{
1152 RAMBlock *block;
1153 ram_addr_t offset;
1154 bool dirty;
1155
1156 do {
06b10688 1157 block = unqueue_page(rs, &offset, page_abs);
a82d593b
DDAG
1158 /*
1159 * We're sending this page, and since it's postcopy nothing else
1160 * will dirty it, and we must make sure it doesn't get sent again
1161 * even if this queue request was received after the background
1162 * search already sent it.
1163 */
1164 if (block) {
1165 unsigned long *bitmap;
eb859c53 1166 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
06b10688 1167 dirty = test_bit(*page_abs, bitmap);
a82d593b 1168 if (!dirty) {
06b10688
JQ
1169 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1170 *page_abs,
1171 test_bit(*page_abs,
1172 atomic_rcu_read(&rs->ram_bitmap)->unsentmap));
a82d593b 1173 } else {
06b10688
JQ
1174 trace_get_queued_page(block->idstr, (uint64_t)offset,
1175 *page_abs);
a82d593b
DDAG
1176 }
1177 }
1178
1179 } while (block && !dirty);
1180
1181 if (block) {
1182 /*
1183 * As soon as we start servicing pages out of order, we have
1184 * to kill the bulk stage, since the bulk stage assumes
1185 * (in migration_bitmap_find_and_reset_dirty) that every page is
1186 * dirty, and that's no longer true.
1187 */
6f37bb8b 1188 rs->ram_bulk_stage = false;
a82d593b
DDAG
1189
1190 /*
1191 * We want the background search to continue from the queued page
1192 * since the guest is likely to want other pages near to the page
1193 * it just requested.
1194 */
1195 pss->block = block;
1196 pss->offset = offset;
1197 }
1198
1199 return !!block;
1200}
1201
6c595cde 1202/**
5e58f968
JQ
1203 * migration_page_queue_free: drop any remaining pages in the ram
1204 * request queue
6c595cde 1205 *
3d0684b2
JQ
1206 * It should be empty at the end anyway, but in error cases there may
1207 * be some left. In case any pages are left, we drop them.
1208 *
6c595cde 1209 */
ec481c6c 1210void migration_page_queue_free(void)
6c595cde 1211{
ec481c6c
JQ
1212 struct RAMSrcPageRequest *mspr, *next_mspr;
1213 RAMState *rs = &ram_state;
6c595cde
DDAG
1214 /* This queue generally should be empty - but in the case of a failed
1215 * migration might have some droppings in.
1216 */
1217 rcu_read_lock();
ec481c6c 1218 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1219 memory_region_unref(mspr->rb->mr);
ec481c6c 1220 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1221 g_free(mspr);
1222 }
1223 rcu_read_unlock();
1224}
1225
1226/**
3d0684b2
JQ
1227 * ram_save_queue_pages: queue the page for transmission
1228 *
1229 * A request from postcopy destination for example.
1230 *
1231 * Returns zero on success or negative on error
1232 *
3d0684b2
JQ
1233 * @rbname: Name of the RAMBlock of the request. NULL means the
1234 * same as the last one.
1235 * @start: starting address from the start of the RAMBlock
1236 * @len: length (in bytes) to send
6c595cde 1237 */
96506894 1238int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1239{
1240 RAMBlock *ramblock;
68a098f3 1241 RAMState *rs = &ram_state;
6c595cde 1242
96506894 1243 rs->postcopy_requests++;
6c595cde
DDAG
1244 rcu_read_lock();
1245 if (!rbname) {
1246 /* Reuse last RAMBlock */
68a098f3 1247 ramblock = rs->last_req_rb;
6c595cde
DDAG
1248
1249 if (!ramblock) {
1250 /*
1251 * Shouldn't happen, we can't reuse the last RAMBlock if
1252 * it's the 1st request.
1253 */
1254 error_report("ram_save_queue_pages no previous block");
1255 goto err;
1256 }
1257 } else {
1258 ramblock = qemu_ram_block_by_name(rbname);
1259
1260 if (!ramblock) {
1261 /* We shouldn't be asked for a non-existent RAMBlock */
1262 error_report("ram_save_queue_pages no block '%s'", rbname);
1263 goto err;
1264 }
68a098f3 1265 rs->last_req_rb = ramblock;
6c595cde
DDAG
1266 }
1267 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1268 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1269 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1270 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1271 __func__, start, len, ramblock->used_length);
1272 goto err;
1273 }
1274
ec481c6c
JQ
1275 struct RAMSrcPageRequest *new_entry =
1276 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1277 new_entry->rb = ramblock;
1278 new_entry->offset = start;
1279 new_entry->len = len;
1280
1281 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1282 qemu_mutex_lock(&rs->src_page_req_mutex);
1283 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1284 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1285 rcu_read_unlock();
1286
1287 return 0;
1288
1289err:
1290 rcu_read_unlock();
1291 return -1;
1292}
1293
a82d593b 1294/**
3d0684b2 1295 * ram_save_target_page: save one target page
a82d593b 1296 *
3d0684b2 1297 * Returns the number of pages written
a82d593b 1298 *
6f37bb8b 1299 * @rs: current RAM state
3d0684b2 1300 * @ms: current migration state
3d0684b2 1301 * @pss: data about the page we want to send
a82d593b 1302 * @last_stage: if we are at the completion stage
06b10688 1303 * @page_abs: page number of the dirty page
a82d593b 1304 */
a0a8aa14 1305static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
06b10688 1306 bool last_stage, unsigned long page_abs)
a82d593b
DDAG
1307{
1308 int res = 0;
1309
1310 /* Check if the page is dirty and, if it is, send it */
06b10688 1311 if (migration_bitmap_clear_dirty(rs, page_abs)) {
a82d593b 1312 unsigned long *unsentmap;
6d358d94
JQ
1313 /*
1314 * If xbzrle is on, stop using the data compression after the first
1315 * round of migration even if compression is enabled. In theory,
1316 * xbzrle can do better than compression.
1317 */
1318
1319 if (migrate_use_compression()
1320 && (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
a0a8aa14 1321 res = ram_save_compressed_page(rs, pss, last_stage);
a82d593b 1322 } else {
a0a8aa14 1323 res = ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1324 }
1325
1326 if (res < 0) {
1327 return res;
1328 }
eb859c53 1329 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
a82d593b 1330 if (unsentmap) {
06b10688 1331 clear_bit(page_abs, unsentmap);
a82d593b
DDAG
1332 }
1333 }
1334
1335 return res;
1336}
1337
1338/**
3d0684b2 1339 * ram_save_host_page: save a whole host page
a82d593b 1340 *
3d0684b2
JQ
1341 * Starting at *offset send pages up to the end of the current host
1342 * page. It's valid for the initial offset to point into the middle of
1343 * a host page in which case the remainder of the hostpage is sent.
1344 * Only dirty target pages are sent. Note that the host page size may
1345 * be a huge page for this block.
a82d593b 1346 *
3d0684b2
JQ
1347 * Returns the number of pages written or negative on error
1348 *
6f37bb8b 1349 * @rs: current RAM state
3d0684b2 1350 * @ms: current migration state
3d0684b2 1351 * @pss: data about the page we want to send
a82d593b 1352 * @last_stage: if we are at the completion stage
06b10688 1353 * @page_abs: Page number of the dirty page
a82d593b 1354 */
a0a8aa14 1355static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
a08f6890 1356 bool last_stage,
06b10688 1357 unsigned long page_abs)
a82d593b
DDAG
1358{
1359 int tmppages, pages = 0;
4c011c37
DDAG
1360 size_t pagesize = qemu_ram_pagesize(pss->block);
1361
a82d593b 1362 do {
06b10688 1363 tmppages = ram_save_target_page(rs, pss, last_stage, page_abs);
a82d593b
DDAG
1364 if (tmppages < 0) {
1365 return tmppages;
1366 }
1367
1368 pages += tmppages;
a08f6890 1369 pss->offset += TARGET_PAGE_SIZE;
06b10688 1370 page_abs++;
4c011c37 1371 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1372
1373 /* The offset we leave with is the last one we looked at */
a08f6890 1374 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1375 return pages;
1376}
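/*
 * Added example for ram_save_host_page() above: with 4 KiB target pages on a
 * 2 MiB hugepage RAMBlock, one call walks up to 512 target pages of the host
 * page, sending those still marked dirty; pss->offset is then stepped back
 * one target page so the caller records the last page actually examined.
 */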
6c595cde 1377
56e93d26 1378/**
3d0684b2 1379 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1380 *
1381 * Called within an RCU critical section.
1382 *
3d0684b2 1383 * Returns the number of pages written where zero means no dirty pages
56e93d26 1384 *
6f37bb8b 1385 * @rs: current RAM state
56e93d26 1386 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1387 *
1388 * On systems where host-page-size > target-page-size it will send all the
1389 * pages in a host page that are dirty.
56e93d26
JQ
1390 */
1391
ce25d337 1392static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1393{
b8fb8cb7 1394 PageSearchStatus pss;
56e93d26 1395 int pages = 0;
b9e60928 1396 bool again, found;
06b10688 1397 unsigned long page_abs; /* Page number of the dirty page */
56e93d26 1398
0827b9e9
AA
1399 /* No dirty page as there is zero RAM */
1400 if (!ram_bytes_total()) {
1401 return pages;
1402 }
1403
6f37bb8b 1404 pss.block = rs->last_seen_block;
269ace29 1405 pss.offset = rs->last_page << TARGET_PAGE_BITS;
b8fb8cb7
DDAG
1406 pss.complete_round = false;
1407
1408 if (!pss.block) {
1409 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1410 }
56e93d26 1411
b9e60928 1412 do {
a82d593b 1413 again = true;
06b10688 1414 found = get_queued_page(rs, &pss, &page_abs);
b9e60928 1415
a82d593b
DDAG
1416 if (!found) {
1417 /* priority queue empty, so just search for something dirty */
06b10688 1418 found = find_dirty_block(rs, &pss, &again, &page_abs);
a82d593b 1419 }
f3f491fc 1420
a82d593b 1421 if (found) {
06b10688 1422 pages = ram_save_host_page(rs, &pss, last_stage, page_abs);
56e93d26 1423 }
b9e60928 1424 } while (!pages && again);
56e93d26 1425
6f37bb8b 1426 rs->last_seen_block = pss.block;
269ace29 1427 rs->last_page = pss.offset >> TARGET_PAGE_BITS;
56e93d26
JQ
1428
1429 return pages;
1430}
1431
1432void acct_update_position(QEMUFile *f, size_t size, bool zero)
1433{
1434 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b
JQ
1435 RAMState *rs = &ram_state;
1436
56e93d26 1437 if (zero) {
f7ccd61b 1438 rs->zero_pages += pages;
56e93d26 1439 } else {
b4d1c6e7 1440 rs->norm_pages += pages;
2f4fde93 1441 rs->bytes_transferred += size;
56e93d26
JQ
1442 qemu_update_position(f, size);
1443 }
1444}
1445
56e93d26
JQ
1446uint64_t ram_bytes_total(void)
1447{
1448 RAMBlock *block;
1449 uint64_t total = 0;
1450
1451 rcu_read_lock();
1452 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1453 total += block->used_length;
1454 rcu_read_unlock();
1455 return total;
1456}
1457
1458void free_xbzrle_decoded_buf(void)
1459{
1460 g_free(xbzrle_decoded_buf);
1461 xbzrle_decoded_buf = NULL;
1462}
1463
eb859c53 1464static void migration_bitmap_free(struct RAMBitmap *bmap)
60be6340
DL
1465{
1466 g_free(bmap->bmap);
f3f491fc 1467 g_free(bmap->unsentmap);
60be6340
DL
1468 g_free(bmap);
1469}
1470
6ad2a215 1471static void ram_migration_cleanup(void *opaque)
56e93d26 1472{
eb859c53
JQ
1473 RAMState *rs = opaque;
1474
2ff64038
LZ
1475 /* The caller must hold the iothread lock or be in a bh, so there is
1476 * no writing race against this migration_bitmap
1477 */
eb859c53
JQ
1478 struct RAMBitmap *bitmap = rs->ram_bitmap;
1479 atomic_rcu_set(&rs->ram_bitmap, NULL);
2ff64038 1480 if (bitmap) {
56e93d26 1481 memory_global_dirty_log_stop();
60be6340 1482 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1483 }
1484
1485 XBZRLE_cache_lock();
1486 if (XBZRLE.cache) {
1487 cache_fini(XBZRLE.cache);
1488 g_free(XBZRLE.encoded_buf);
1489 g_free(XBZRLE.current_buf);
adb65dec 1490 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1491 XBZRLE.cache = NULL;
1492 XBZRLE.encoded_buf = NULL;
1493 XBZRLE.current_buf = NULL;
1494 }
1495 XBZRLE_cache_unlock();
1496}
1497
6f37bb8b 1498static void ram_state_reset(RAMState *rs)
56e93d26 1499{
6f37bb8b
JQ
1500 rs->last_seen_block = NULL;
1501 rs->last_sent_block = NULL;
269ace29 1502 rs->last_page = 0;
6f37bb8b
JQ
1503 rs->last_version = ram_list.version;
1504 rs->ram_bulk_stage = true;
56e93d26
JQ
1505}
1506
1507#define MAX_WAIT 50 /* ms, half buffered_file limit */
1508
dd631697
LZ
1509void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1510{
0d8ec885 1511 RAMState *rs = &ram_state;
108cfae0 1512
dd631697
LZ
1513 /* called in the qemu main thread, so there is
1514 * no writing race against this migration_bitmap
1515 */
eb859c53
JQ
1516 if (rs->ram_bitmap) {
1517 struct RAMBitmap *old_bitmap = rs->ram_bitmap, *bitmap;
1518 bitmap = g_new(struct RAMBitmap, 1);
60be6340 1519 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1520
1521 /* prevent bits in migration_bitmap from being set
1522 * by migration_bitmap_sync_range() at the same time.
1523 * It is safe for migration if migration_bitmap bits are cleared
1524 * at the same time.
1525 */
108cfae0 1526 qemu_mutex_lock(&rs->bitmap_mutex);
60be6340
DL
1527 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1528 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1529
1530 /* We don't have a way to safely extend the unsentmap
1531 * with RCU; so mark it as missing, and entry to postcopy
1532 * will fail.
1533 */
1534 bitmap->unsentmap = NULL;
1535
eb859c53 1536 atomic_rcu_set(&rs->ram_bitmap, bitmap);
108cfae0 1537 qemu_mutex_unlock(&rs->bitmap_mutex);
0d8ec885 1538 rs->migration_dirty_pages += new - old;
60be6340 1539 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1540 }
1541}
56e93d26 1542
4f2e4252
DDAG
1543/*
1544 * 'expected' is the value you expect the bitmap mostly to be full
1545 * of; it won't bother printing lines that are all this value.
1546 * If 'todump' is null the migration bitmap is dumped.
1547 */
1548void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1549{
1550 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53 1551 RAMState *rs = &ram_state;
4f2e4252
DDAG
1552 int64_t cur;
1553 int64_t linelen = 128;
1554 char linebuf[129];
1555
1556 if (!todump) {
eb859c53 1557 todump = atomic_rcu_read(&rs->ram_bitmap)->bmap;
4f2e4252
DDAG
1558 }
1559
1560 for (cur = 0; cur < ram_pages; cur += linelen) {
1561 int64_t curb;
1562 bool found = false;
1563 /*
1564 * Last line; catch the case where the line length
1565 * is longer than remaining ram
1566 */
1567 if (cur + linelen > ram_pages) {
1568 linelen = ram_pages - cur;
1569 }
1570 for (curb = 0; curb < linelen; curb++) {
1571 bool thisbit = test_bit(cur + curb, todump);
1572 linebuf[curb] = thisbit ? '1' : '.';
1573 found = found || (thisbit != expected);
1574 }
1575 if (found) {
1576 linebuf[curb] = '\0';
1577 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1578 }
1579 }
1580}
1581
e0b266f0
DDAG
1582/* **** functions for postcopy ***** */
1583
ced1c616
PB
1584void ram_postcopy_migrated_memory_release(MigrationState *ms)
1585{
eb859c53 1586 RAMState *rs = &ram_state;
ced1c616 1587 struct RAMBlock *block;
eb859c53 1588 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
ced1c616
PB
1589
1590 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1591 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1592 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1593 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1594
1595 while (run_start < range) {
1596 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 1597 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
1598 (run_end - run_start) << TARGET_PAGE_BITS);
1599 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1600 }
1601 }
1602}
1603
3d0684b2
JQ
1604/**
1605 * postcopy_send_discard_bm_ram: discard a RAMBlock
1606 *
1607 * Returns zero on success
1608 *
e0b266f0
DDAG
1609 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1610 * Note: At this point the 'unsentmap' is the processed bitmap combined
1611 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1612 *
1613 * @ms: current migration state
1614 * @pds: state for postcopy
1615 * @start: RAMBlock starting page
1616 * @length: RAMBlock size
e0b266f0
DDAG
1617 */
1618static int postcopy_send_discard_bm_ram(MigrationState *ms,
1619 PostcopyDiscardState *pds,
1620 unsigned long start,
1621 unsigned long length)
1622{
eb859c53 1623 RAMState *rs = &ram_state;
e0b266f0
DDAG
1624 unsigned long end = start + length; /* one after the end */
1625 unsigned long current;
1626 unsigned long *unsentmap;
1627
eb859c53 1628 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1629 for (current = start; current < end; ) {
1630 unsigned long one = find_next_bit(unsentmap, end, current);
1631
1632 if (one <= end) {
1633 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1634 unsigned long discard_length;
1635
1636 if (zero >= end) {
1637 discard_length = end - one;
1638 } else {
1639 discard_length = zero - one;
1640 }
d688c62d
DDAG
1641 if (discard_length) {
1642 postcopy_discard_send_range(ms, pds, one, discard_length);
1643 }
e0b266f0
DDAG
1644 current = one + discard_length;
1645 } else {
1646 current = one;
1647 }
1648 }
1649
1650 return 0;
1651}
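/*
 * Added example for postcopy_send_discard_bm_ram() above: for an unsentmap
 * chunk that looks like 0011110010 the loop finds each run of 1s (pages that
 * are unsent and/or dirty) and emits one discard range per run, here
 * (start+2, length 4) and (start+8, length 1), in target-page units.
 */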
1652
3d0684b2
JQ
1653/**
1654 * postcopy_each_ram_send_discard: discard all RAMBlocks
1655 *
1656 * Returns 0 for success or negative for error
1657 *
e0b266f0
DDAG
1658 * Utility for the outgoing postcopy code.
1659 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1660 * passing it bitmap indexes and name.
e0b266f0
DDAG
1661 * (qemu_ram_foreach_block ends up passing unscaled lengths
1662 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1663 *
1664 * @ms: current migration state
e0b266f0
DDAG
1665 */
1666static int postcopy_each_ram_send_discard(MigrationState *ms)
1667{
1668 struct RAMBlock *block;
1669 int ret;
1670
1671 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1672 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1673 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1674 first,
1675 block->idstr);
1676
1677 /*
1678 * Postcopy sends chunks of bitmap over the wire, but it
1679 * just needs indexes at this point, avoids it having
1680 * target page specific code.
1681 */
1682 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1683 block->used_length >> TARGET_PAGE_BITS);
1684 postcopy_discard_send_finish(ms, pds);
1685 if (ret) {
1686 return ret;
1687 }
1688 }
1689
1690 return 0;
1691}
1692
3d0684b2
JQ
1693/**
 1694 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
 1695 *
 1696 * Helper for postcopy_chunk_hostpages; it's called twice to
 1697 * canonicalize the two bitmaps, which are similar but one is
1698 * inverted.
99e314eb 1699 *
3d0684b2
JQ
1700 * Postcopy requires that all target pages in a hostpage are dirty or
1701 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1702 *
3d0684b2
JQ
1703 * @ms: current migration state
1704 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1705 * otherwise we need to canonicalize partially dirty host pages
1706 * @block: block that contains the page we want to canonicalize
1707 * @pds: state for postcopy
99e314eb
DDAG
1708 */
1709static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1710 RAMBlock *block,
1711 PostcopyDiscardState *pds)
1712{
0d8ec885 1713 RAMState *rs = &ram_state;
99e314eb
DDAG
1714 unsigned long *bitmap;
1715 unsigned long *unsentmap;
29c59172 1716 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1717 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1718 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1719 unsigned long last = first + (len - 1);
1720 unsigned long run_start;
1721
29c59172
DDAG
1722 if (block->page_size == TARGET_PAGE_SIZE) {
1723 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1724 return;
1725 }
1726
eb859c53
JQ
1727 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
1728 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
99e314eb
DDAG
1729
1730 if (unsent_pass) {
1731 /* Find a sent page */
1732 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1733 } else {
1734 /* Find a dirty page */
1735 run_start = find_next_bit(bitmap, last + 1, first);
1736 }
1737
1738 while (run_start <= last) {
1739 bool do_fixup = false;
1740 unsigned long fixup_start_addr;
1741 unsigned long host_offset;
1742
1743 /*
1744 * If the start of this run of pages is in the middle of a host
1745 * page, then we need to fixup this host page.
1746 */
1747 host_offset = run_start % host_ratio;
1748 if (host_offset) {
1749 do_fixup = true;
1750 run_start -= host_offset;
1751 fixup_start_addr = run_start;
1752 /* For the next pass */
1753 run_start = run_start + host_ratio;
1754 } else {
1755 /* Find the end of this run */
1756 unsigned long run_end;
1757 if (unsent_pass) {
1758 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1759 } else {
1760 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1761 }
1762 /*
1763 * If the end isn't at the start of a host page, then the
1764 * run doesn't finish at the end of a host page
1765 * and we need to discard.
1766 */
1767 host_offset = run_end % host_ratio;
1768 if (host_offset) {
1769 do_fixup = true;
1770 fixup_start_addr = run_end - host_offset;
1771 /*
1772 * This host page has gone, the next loop iteration starts
1773 * from after the fixup
1774 */
1775 run_start = fixup_start_addr + host_ratio;
1776 } else {
1777 /*
1778 * No discards on this iteration, next loop starts from
1779 * next sent/dirty page
1780 */
1781 run_start = run_end + 1;
1782 }
1783 }
1784
1785 if (do_fixup) {
1786 unsigned long page;
1787
1788 /* Tell the destination to discard this page */
1789 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1790 /* For the unsent_pass we:
1791 * discard partially sent pages
1792 * For the !unsent_pass (dirty) we:
1793 * discard partially dirty pages that were sent
1794 * (any partially sent pages were already discarded
1795 * by the previous unsent_pass)
1796 */
1797 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1798 host_ratio);
1799 }
1800
1801 /* Clean up the bitmap */
1802 for (page = fixup_start_addr;
1803 page < fixup_start_addr + host_ratio; page++) {
1804 /* All pages in this host page are now not sent */
1805 set_bit(page, unsentmap);
1806
1807 /*
1808 * Remark them as dirty, updating the count for any pages
1809 * that weren't previously dirty.
1810 */
0d8ec885 1811 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1812 }
1813 }
1814
1815 if (unsent_pass) {
1816 /* Find the next sent page for the next iteration */
1817 run_start = find_next_zero_bit(unsentmap, last + 1,
1818 run_start);
1819 } else {
1820 /* Find the next dirty page for the next iteration */
1821 run_start = find_next_bit(bitmap, last + 1, run_start);
1822 }
1823 }
1824}
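/*
 * Worked example of the fixup arithmetic above, assuming a hypothetical
 * RAMBlock backed by 2MB huge pages with 4KB target pages, i.e.
 * host_ratio == 512.  toy_fixup_start() is an illustrative helper, not
 * QEMU API: it only shows how a run edge is pulled back to a host-page
 * boundary before the whole host page is discarded and re-marked.
 */
#include <assert.h>

static unsigned long toy_fixup_start(unsigned long run_start,
                                     unsigned long host_ratio)
{
    /* align the start of the run down to a host-page boundary */
    return run_start - (run_start % host_ratio);
}

static void toy_fixup_example(void)
{
    /* a dirty run starting at target page 1030 drags in host page 1024..1535 */
    assert(toy_fixup_start(1030, 512) == 1024);
    /* an already aligned run needs no fixup at its start */
    assert(toy_fixup_start(1024, 512) == 1024);
}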
1825
3d0684b2
JQ
1826/**
 1827 * postcopy_chunk_hostpages: discard any partially sent host page
1828 *
99e314eb
DDAG
1829 * Utility for the outgoing postcopy code.
1830 *
1831 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1832 * dirty host-page size chunks as all dirty. In this case the host-page
1833 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1834 *
3d0684b2
JQ
1835 * Returns zero on success
1836 *
1837 * @ms: current migration state
99e314eb
DDAG
1838 */
1839static int postcopy_chunk_hostpages(MigrationState *ms)
1840{
6f37bb8b 1841 RAMState *rs = &ram_state;
99e314eb
DDAG
1842 struct RAMBlock *block;
1843
99e314eb 1844 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1845 rs->last_seen_block = NULL;
1846 rs->last_sent_block = NULL;
269ace29 1847 rs->last_page = 0;
99e314eb
DDAG
1848
1849 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1850 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1851
1852 PostcopyDiscardState *pds =
1853 postcopy_discard_send_init(ms, first, block->idstr);
1854
1855 /* First pass: Discard all partially sent host pages */
1856 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1857 /*
1858 * Second pass: Ensure that all partially dirty host pages are made
1859 * fully dirty.
1860 */
1861 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1862
1863 postcopy_discard_send_finish(ms, pds);
1864 } /* ram_list loop */
1865
1866 return 0;
1867}
1868
3d0684b2
JQ
1869/**
1870 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1871 *
1872 * Returns zero on success
1873 *
e0b266f0
DDAG
 1874 * Transmit the set of pages to be discarded after precopy to the target;
 1875 * these are pages that:
 1876 * a) Have been previously transmitted but are now dirty again
 1877 * b) Have never been transmitted; this ensures that any pages on the
 1878 * destination that have been mapped by background tasks get discarded
 1879 * (transparent huge pages are the specific concern)
1880 * Hopefully this is pretty sparse
3d0684b2
JQ
1881 *
1882 * @ms: current migration state
e0b266f0
DDAG
1883 */
1884int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1885{
eb859c53 1886 RAMState *rs = &ram_state;
e0b266f0
DDAG
1887 int ret;
1888 unsigned long *bitmap, *unsentmap;
1889
1890 rcu_read_lock();
1891
1892 /* This should be our last sync, the src is now paused */
eb859c53 1893 migration_bitmap_sync(rs);
e0b266f0 1894
eb859c53 1895 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1896 if (!unsentmap) {
 1897 /* We don't have a safe way to resize the unsentmap, so
1898 * if the bitmap was resized it will be NULL at this
1899 * point.
1900 */
1901 error_report("migration ram resized during precopy phase");
1902 rcu_read_unlock();
1903 return -EINVAL;
1904 }
1905
29c59172 1906 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1907 ret = postcopy_chunk_hostpages(ms);
1908 if (ret) {
1909 rcu_read_unlock();
1910 return ret;
1911 }
1912
e0b266f0
DDAG
1913 /*
1914 * Update the unsentmap to be unsentmap = unsentmap | dirty
1915 */
eb859c53 1916 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
e0b266f0
DDAG
1917 bitmap_or(unsentmap, unsentmap, bitmap,
1918 last_ram_offset() >> TARGET_PAGE_BITS);
1919
1920
1921 trace_ram_postcopy_send_discard_bitmap();
1922#ifdef DEBUG_POSTCOPY
1923 ram_debug_dump_bitmap(unsentmap, true);
1924#endif
1925
1926 ret = postcopy_each_ram_send_discard(ms);
1927 rcu_read_unlock();
1928
1929 return ret;
1930}
1931
3d0684b2
JQ
1932/**
1933 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1934 *
3d0684b2 1935 * Returns zero on success
e0b266f0 1936 *
36449157
JQ
 1937 * @rbname: name of the RAMBlock of the request. NULL means the
 1938 * same as the last one.
3d0684b2
JQ
 1939 * @start: byte offset within the RAMBlock
 1940 * @length: length in bytes to discard
e0b266f0 1941 */
aaa2064c 1942int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
1943{
1944 int ret = -1;
1945
36449157 1946 trace_ram_discard_range(rbname, start, length);
d3a5038c 1947
e0b266f0 1948 rcu_read_lock();
36449157 1949 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1950
1951 if (!rb) {
36449157 1952 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1953 goto err;
1954 }
1955
d3a5038c 1956 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1957
1958err:
1959 rcu_read_unlock();
1960
1961 return ret;
1962}
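/*
 * Hedged usage sketch: callers such as ram_postcopy_migrated_memory_release()
 * above work in target-page indexes and convert to the byte offset/length
 * that ram_discard_range() expects.  toy_discard_pages() is illustrative
 * only; a real caller takes the block name from block->idstr.
 */
static int toy_discard_pages(const char *rbname,
                             unsigned long first_page, unsigned long npages)
{
    return ram_discard_range(rbname,
                             (uint64_t)first_page << TARGET_PAGE_BITS,
                             (size_t)npages << TARGET_PAGE_BITS);
}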
1963
ceb4d168 1964static int ram_state_init(RAMState *rs)
56e93d26 1965{
56e93d26
JQ
1966 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1967
ceb4d168 1968 memset(rs, 0, sizeof(*rs));
108cfae0 1969 qemu_mutex_init(&rs->bitmap_mutex);
ec481c6c
JQ
1970 qemu_mutex_init(&rs->src_page_req_mutex);
1971 QSIMPLEQ_INIT(&rs->src_page_requests);
56e93d26
JQ
1972
1973 if (migrate_use_xbzrle()) {
1974 XBZRLE_cache_lock();
adb65dec 1975 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1976 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1977 TARGET_PAGE_SIZE,
1978 TARGET_PAGE_SIZE);
1979 if (!XBZRLE.cache) {
1980 XBZRLE_cache_unlock();
1981 error_report("Error creating cache");
1982 return -1;
1983 }
1984 XBZRLE_cache_unlock();
1985
1986 /* We prefer not to abort if there is no memory */
1987 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1988 if (!XBZRLE.encoded_buf) {
1989 error_report("Error allocating encoded_buf");
1990 return -1;
1991 }
1992
1993 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1994 if (!XBZRLE.current_buf) {
1995 error_report("Error allocating current_buf");
1996 g_free(XBZRLE.encoded_buf);
1997 XBZRLE.encoded_buf = NULL;
1998 return -1;
1999 }
56e93d26
JQ
2000 }
2001
49877834
PB
2002 /* For memory_global_dirty_log_start below. */
2003 qemu_mutex_lock_iothread();
2004
56e93d26
JQ
2005 qemu_mutex_lock_ramlist();
2006 rcu_read_lock();
6f37bb8b 2007 ram_state_reset(rs);
56e93d26 2008
eb859c53 2009 rs->ram_bitmap = g_new0(struct RAMBitmap, 1);
0827b9e9
AA
2010 /* Skip setting bitmap if there is no RAM */
2011 if (ram_bytes_total()) {
2012 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53
JQ
2013 rs->ram_bitmap->bmap = bitmap_new(ram_bitmap_pages);
2014 bitmap_set(rs->ram_bitmap->bmap, 0, ram_bitmap_pages);
0827b9e9
AA
2015
2016 if (migrate_postcopy_ram()) {
eb859c53
JQ
2017 rs->ram_bitmap->unsentmap = bitmap_new(ram_bitmap_pages);
2018 bitmap_set(rs->ram_bitmap->unsentmap, 0, ram_bitmap_pages);
0827b9e9 2019 }
f3f491fc
DDAG
2020 }
2021
56e93d26
JQ
2022 /*
2023 * Count the total number of pages used by ram blocks not including any
2024 * gaps due to alignment or unplugs.
2025 */
0d8ec885 2026 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
2027
2028 memory_global_dirty_log_start();
8d820d6f 2029 migration_bitmap_sync(rs);
56e93d26 2030 qemu_mutex_unlock_ramlist();
49877834 2031 qemu_mutex_unlock_iothread();
a91246c9
HZ
2032 rcu_read_unlock();
2033
2034 return 0;
2035}
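/*
 * Back-of-the-envelope sizing for the bitmaps allocated above, assuming a
 * hypothetical 4GB guest with 4KB target pages: last_ram_offset() >> 12
 * gives roughly 1M pages, so bmap (and unsentmap, when postcopy is enabled)
 * is about 1M bits, i.e. ~128KB per bitmap.
 */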
2036
3d0684b2
JQ
2037/*
2038 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
 2039 * a long-running RCU critical section. When RCU reclaims in the code
2040 * start to become numerous it will be necessary to reduce the
2041 * granularity of these critical sections.
2042 */
2043
3d0684b2
JQ
2044/**
2045 * ram_save_setup: Setup RAM for migration
2046 *
2047 * Returns zero to indicate success and negative for error
2048 *
2049 * @f: QEMUFile where to send the data
2050 * @opaque: RAMState pointer
2051 */
a91246c9
HZ
2052static int ram_save_setup(QEMUFile *f, void *opaque)
2053{
6f37bb8b 2054 RAMState *rs = opaque;
a91246c9
HZ
2055 RAMBlock *block;
2056
2057 /* migration has already setup the bitmap, reuse it. */
2058 if (!migration_in_colo_state()) {
ceb4d168 2059 if (ram_state_init(rs) < 0) {
a91246c9
HZ
2060 return -1;
2061 }
2062 }
204b88b8 2063 rs->f = f;
a91246c9
HZ
2064
2065 rcu_read_lock();
56e93d26
JQ
2066
2067 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2068
2069 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2070 qemu_put_byte(f, strlen(block->idstr));
2071 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2072 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2073 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2074 qemu_put_be64(f, block->page_size);
2075 }
56e93d26
JQ
2076 }
2077
2078 rcu_read_unlock();
2079
2080 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2081 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2082
2083 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2084
2085 return 0;
2086}
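/*
 * Rough sketch of the setup-stage stream that the loop above produces, as
 * a receiver would see it; this is only an illustration of the code above,
 * not a formal protocol description:
 *
 *   be64  ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *       u8    strlen(idstr)
 *       bytes idstr (not NUL terminated)
 *       be64  used_length
 *       be64  page_size   (only when postcopy is on and the block's
 *                          page_size differs from qemu_host_page_size)
 *   be64  RAM_SAVE_FLAG_EOS
 */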
2087
3d0684b2
JQ
2088/**
2089 * ram_save_iterate: iterative stage for migration
2090 *
2091 * Returns zero to indicate success and negative for error
2092 *
2093 * @f: QEMUFile where to send the data
2094 * @opaque: RAMState pointer
2095 */
56e93d26
JQ
2096static int ram_save_iterate(QEMUFile *f, void *opaque)
2097{
6f37bb8b 2098 RAMState *rs = opaque;
56e93d26
JQ
2099 int ret;
2100 int i;
2101 int64_t t0;
5c90308f 2102 int done = 0;
56e93d26
JQ
2103
2104 rcu_read_lock();
6f37bb8b
JQ
2105 if (ram_list.version != rs->last_version) {
2106 ram_state_reset(rs);
56e93d26
JQ
2107 }
2108
2109 /* Read version before ram_list.blocks */
2110 smp_rmb();
2111
2112 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2113
2114 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2115 i = 0;
2116 while ((ret = qemu_file_rate_limit(f)) == 0) {
2117 int pages;
2118
ce25d337 2119 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
 2120 /* no more pages to send */
2121 if (pages == 0) {
5c90308f 2122 done = 1;
56e93d26
JQ
2123 break;
2124 }
23b28c3c 2125 rs->iterations++;
070afca2 2126
56e93d26
JQ
2127 /* we want to check in the 1st loop, just in case it was the 1st time
2128 and we had to sync the dirty bitmap.
 2129 qemu_clock_get_ns() is a bit expensive, so we only check every few
 2130 iterations
2131 */
2132 if ((i & 63) == 0) {
2133 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2134 if (t1 > MAX_WAIT) {
55c4446b 2135 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2136 break;
2137 }
2138 }
2139 i++;
2140 }
ce25d337 2141 flush_compressed_data(rs);
56e93d26
JQ
2142 rcu_read_unlock();
2143
2144 /*
2145 * Must occur before EOS (or any QEMUFile operation)
2146 * because of RDMA protocol.
2147 */
2148 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2149
2150 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2f4fde93 2151 rs->bytes_transferred += 8;
56e93d26
JQ
2152
2153 ret = qemu_file_get_error(f);
2154 if (ret < 0) {
2155 return ret;
2156 }
2157
5c90308f 2158 return done;
56e93d26
JQ
2159}
2160
3d0684b2
JQ
2161/**
2162 * ram_save_complete: function called to send the remaining amount of ram
2163 *
2164 * Returns zero to indicate success
2165 *
2166 * Called with iothread lock
2167 *
2168 * @f: QEMUFile where to send the data
2169 * @opaque: RAMState pointer
2170 */
56e93d26
JQ
2171static int ram_save_complete(QEMUFile *f, void *opaque)
2172{
6f37bb8b
JQ
2173 RAMState *rs = opaque;
2174
56e93d26
JQ
2175 rcu_read_lock();
2176
5727309d 2177 if (!migration_in_postcopy()) {
8d820d6f 2178 migration_bitmap_sync(rs);
663e6c1d 2179 }
56e93d26
JQ
2180
2181 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2182
2183 /* try transferring iterative blocks of memory */
2184
2185 /* flush all remaining blocks regardless of rate limiting */
2186 while (true) {
2187 int pages;
2188
ce25d337 2189 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
 2190 /* no more blocks to send */
2191 if (pages == 0) {
2192 break;
2193 }
2194 }
2195
ce25d337 2196 flush_compressed_data(rs);
56e93d26 2197 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2198
2199 rcu_read_unlock();
d09a6fde 2200
56e93d26
JQ
2201 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2202
2203 return 0;
2204}
2205
c31b098f
DDAG
2206static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2207 uint64_t *non_postcopiable_pending,
2208 uint64_t *postcopiable_pending)
56e93d26 2209{
8d820d6f 2210 RAMState *rs = opaque;
56e93d26
JQ
2211 uint64_t remaining_size;
2212
9edabd4d 2213 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2214
5727309d 2215 if (!migration_in_postcopy() &&
663e6c1d 2216 remaining_size < max_size) {
56e93d26
JQ
2217 qemu_mutex_lock_iothread();
2218 rcu_read_lock();
8d820d6f 2219 migration_bitmap_sync(rs);
56e93d26
JQ
2220 rcu_read_unlock();
2221 qemu_mutex_unlock_iothread();
9edabd4d 2222 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2223 }
c31b098f
DDAG
2224
2225 /* We can do postcopy, and all the data is postcopiable */
2226 *postcopiable_pending += remaining_size;
56e93d26
JQ
2227}
2228
2229static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2230{
2231 unsigned int xh_len;
2232 int xh_flags;
063e760a 2233 uint8_t *loaded_data;
56e93d26
JQ
2234
2235 if (!xbzrle_decoded_buf) {
2236 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2237 }
063e760a 2238 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2239
2240 /* extract RLE header */
2241 xh_flags = qemu_get_byte(f);
2242 xh_len = qemu_get_be16(f);
2243
2244 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2245 error_report("Failed to load XBZRLE page - wrong compression!");
2246 return -1;
2247 }
2248
2249 if (xh_len > TARGET_PAGE_SIZE) {
2250 error_report("Failed to load XBZRLE page - len overflow!");
2251 return -1;
2252 }
2253 /* load data and decode */
063e760a 2254 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2255
2256 /* decode RLE */
063e760a 2257 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2258 TARGET_PAGE_SIZE) == -1) {
2259 error_report("Failed to load XBZRLE page - decode error!");
2260 return -1;
2261 }
2262
2263 return 0;
2264}
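/*
 * Per-page wire format consumed by load_xbzrle() above (illustration only;
 * the enclosing addr/flags header is handled by ram_load()):
 *
 *   u8    xh_flags  -- must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len    -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes encoded   -- xh_len bytes, decoded by xbzrle_decode_buffer()
 *                      on top of the destination's current page contents
 */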
2265
3d0684b2
JQ
2266/**
2267 * ram_block_from_stream: read a RAMBlock id from the migration stream
2268 *
2269 * Must be called from within a rcu critical section.
2270 *
56e93d26 2271 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2272 *
3d0684b2
JQ
2273 * @f: QEMUFile where to read the data from
2274 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2275 */
3d0684b2 2276static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2277{
2278 static RAMBlock *block = NULL;
2279 char id[256];
2280 uint8_t len;
2281
2282 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2283 if (!block) {
56e93d26
JQ
2284 error_report("Ack, bad migration stream!");
2285 return NULL;
2286 }
4c4bad48 2287 return block;
56e93d26
JQ
2288 }
2289
2290 len = qemu_get_byte(f);
2291 qemu_get_buffer(f, (uint8_t *)id, len);
2292 id[len] = 0;
2293
e3dd7493 2294 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2295 if (!block) {
2296 error_report("Can't find block %s", id);
2297 return NULL;
56e93d26
JQ
2298 }
2299
4c4bad48
HZ
2300 return block;
2301}
2302
2303static inline void *host_from_ram_block_offset(RAMBlock *block,
2304 ram_addr_t offset)
2305{
2306 if (!offset_in_ramblock(block, offset)) {
2307 return NULL;
2308 }
2309
2310 return block->host + offset;
56e93d26
JQ
2311}
2312
3d0684b2
JQ
2313/**
2314 * ram_handle_compressed: handle the zero page case
2315 *
56e93d26
JQ
2316 * If a page (or a whole RDMA chunk) has been
2317 * determined to be zero, then zap it.
3d0684b2
JQ
2318 *
2319 * @host: host address for the zero page
2320 * @ch: what the page is filled from. We only support zero
2321 * @size: size of the zero page
56e93d26
JQ
2322 */
2323void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2324{
2325 if (ch != 0 || !is_zero_range(host, size)) {
2326 memset(host, ch, size);
2327 }
2328}
2329
2330static void *do_data_decompress(void *opaque)
2331{
2332 DecompressParam *param = opaque;
2333 unsigned long pagesize;
33d151f4
LL
2334 uint8_t *des;
2335 int len;
56e93d26 2336
33d151f4 2337 qemu_mutex_lock(&param->mutex);
90e56fb4 2338 while (!param->quit) {
33d151f4
LL
2339 if (param->des) {
2340 des = param->des;
2341 len = param->len;
2342 param->des = 0;
2343 qemu_mutex_unlock(&param->mutex);
2344
56e93d26 2345 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
 2346 /* uncompress() can fail in some cases, especially when the
 2347 * page is dirtied while being compressed; that's not a problem
 2348 * because the dirty page will be retransferred and uncompress()
 2349 * won't corrupt the data in other pages.
2350 */
33d151f4
LL
2351 uncompress((Bytef *)des, &pagesize,
2352 (const Bytef *)param->compbuf, len);
73a8912b 2353
33d151f4
LL
2354 qemu_mutex_lock(&decomp_done_lock);
2355 param->done = true;
2356 qemu_cond_signal(&decomp_done_cond);
2357 qemu_mutex_unlock(&decomp_done_lock);
2358
2359 qemu_mutex_lock(&param->mutex);
2360 } else {
2361 qemu_cond_wait(&param->cond, &param->mutex);
2362 }
56e93d26 2363 }
33d151f4 2364 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2365
2366 return NULL;
2367}
2368
5533b2e9
LL
2369static void wait_for_decompress_done(void)
2370{
2371 int idx, thread_count;
2372
2373 if (!migrate_use_compression()) {
2374 return;
2375 }
2376
2377 thread_count = migrate_decompress_threads();
2378 qemu_mutex_lock(&decomp_done_lock);
2379 for (idx = 0; idx < thread_count; idx++) {
2380 while (!decomp_param[idx].done) {
2381 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2382 }
2383 }
2384 qemu_mutex_unlock(&decomp_done_lock);
2385}
2386
56e93d26
JQ
2387void migrate_decompress_threads_create(void)
2388{
2389 int i, thread_count;
2390
2391 thread_count = migrate_decompress_threads();
2392 decompress_threads = g_new0(QemuThread, thread_count);
2393 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2394 qemu_mutex_init(&decomp_done_lock);
2395 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2396 for (i = 0; i < thread_count; i++) {
2397 qemu_mutex_init(&decomp_param[i].mutex);
2398 qemu_cond_init(&decomp_param[i].cond);
2399 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2400 decomp_param[i].done = true;
90e56fb4 2401 decomp_param[i].quit = false;
56e93d26
JQ
2402 qemu_thread_create(decompress_threads + i, "decompress",
2403 do_data_decompress, decomp_param + i,
2404 QEMU_THREAD_JOINABLE);
2405 }
2406}
2407
2408void migrate_decompress_threads_join(void)
2409{
2410 int i, thread_count;
2411
56e93d26
JQ
2412 thread_count = migrate_decompress_threads();
2413 for (i = 0; i < thread_count; i++) {
2414 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2415 decomp_param[i].quit = true;
56e93d26
JQ
2416 qemu_cond_signal(&decomp_param[i].cond);
2417 qemu_mutex_unlock(&decomp_param[i].mutex);
2418 }
2419 for (i = 0; i < thread_count; i++) {
2420 qemu_thread_join(decompress_threads + i);
2421 qemu_mutex_destroy(&decomp_param[i].mutex);
2422 qemu_cond_destroy(&decomp_param[i].cond);
2423 g_free(decomp_param[i].compbuf);
2424 }
2425 g_free(decompress_threads);
2426 g_free(decomp_param);
56e93d26
JQ
2427 decompress_threads = NULL;
2428 decomp_param = NULL;
56e93d26
JQ
2429}
2430
c1bc6626 2431static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2432 void *host, int len)
2433{
2434 int idx, thread_count;
2435
2436 thread_count = migrate_decompress_threads();
73a8912b 2437 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2438 while (true) {
2439 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2440 if (decomp_param[idx].done) {
33d151f4
LL
2441 decomp_param[idx].done = false;
2442 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2443 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2444 decomp_param[idx].des = host;
2445 decomp_param[idx].len = len;
33d151f4
LL
2446 qemu_cond_signal(&decomp_param[idx].cond);
2447 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2448 break;
2449 }
2450 }
2451 if (idx < thread_count) {
2452 break;
73a8912b
LL
2453 } else {
2454 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2455 }
2456 }
73a8912b 2457 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2458}
2459
3d0684b2
JQ
2460/**
2461 * ram_postcopy_incoming_init: allocate postcopy data structures
2462 *
2463 * Returns 0 for success and negative if there was one error
2464 *
2465 * @mis: current migration incoming state
2466 *
2467 * Allocate data structures etc needed by incoming migration with
 2468 * postcopy-ram. postcopy-ram's similarly named
2469 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2470 */
2471int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2472{
2473 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2474
2475 return postcopy_ram_incoming_init(mis, ram_pages);
2476}
2477
3d0684b2
JQ
2478/**
2479 * ram_load_postcopy: load a page in postcopy case
2480 *
2481 * Returns 0 for success or -errno in case of error
2482 *
a7180877
DDAG
2483 * Called in postcopy mode by ram_load().
2484 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2485 *
 2486 * @f: QEMUFile to read the data from
a7180877
DDAG
2487 */
2488static int ram_load_postcopy(QEMUFile *f)
2489{
2490 int flags = 0, ret = 0;
2491 bool place_needed = false;
28abd200 2492 bool matching_page_sizes = false;
a7180877
DDAG
2493 MigrationIncomingState *mis = migration_incoming_get_current();
2494 /* Temporary page that is later 'placed' */
2495 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2496 void *last_host = NULL;
a3b6ff6d 2497 bool all_zero = false;
a7180877
DDAG
2498
2499 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2500 ram_addr_t addr;
2501 void *host = NULL;
2502 void *page_buffer = NULL;
2503 void *place_source = NULL;
df9ff5e1 2504 RAMBlock *block = NULL;
a7180877 2505 uint8_t ch;
a7180877
DDAG
2506
2507 addr = qemu_get_be64(f);
2508 flags = addr & ~TARGET_PAGE_MASK;
2509 addr &= TARGET_PAGE_MASK;
2510
2511 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2512 place_needed = false;
2513 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2514 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2515
2516 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2517 if (!host) {
2518 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2519 ret = -EINVAL;
2520 break;
2521 }
28abd200 2522 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2523 /*
28abd200
DDAG
2524 * Postcopy requires that we place whole host pages atomically;
2525 * these may be huge pages for RAMBlocks that are backed by
2526 * hugetlbfs.
a7180877
DDAG
2527 * To make it atomic, the data is read into a temporary page
2528 * that's moved into place later.
 2529 * The migration protocol uses, possibly smaller, target pages;
 2530 * however, the source ensures it always sends all the components
2531 * of a host page in order.
2532 */
2533 page_buffer = postcopy_host_page +
28abd200 2534 ((uintptr_t)host & (block->page_size - 1));
a7180877 2535 /* If all target pages (TPs) are zero then we can optimise the place */
28abd200 2536 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2537 all_zero = true;
c53b7ddc
DDAG
2538 } else {
2539 /* not the 1st TP within the HP */
2540 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2541 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2542 host, last_host);
2543 ret = -EINVAL;
2544 break;
2545 }
a7180877
DDAG
2546 }
2547
c53b7ddc 2548
a7180877
DDAG
2549 /*
2550 * If it's the last part of a host page then we place the host
2551 * page
2552 */
2553 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2554 (block->page_size - 1)) == 0;
a7180877
DDAG
2555 place_source = postcopy_host_page;
2556 }
c53b7ddc 2557 last_host = host;
a7180877
DDAG
2558
2559 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2560 case RAM_SAVE_FLAG_COMPRESS:
2561 ch = qemu_get_byte(f);
2562 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2563 if (ch) {
2564 all_zero = false;
2565 }
2566 break;
2567
2568 case RAM_SAVE_FLAG_PAGE:
2569 all_zero = false;
2570 if (!place_needed || !matching_page_sizes) {
2571 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2572 } else {
2573 /* Avoids the qemu_file copy during postcopy, which is
2574 * going to do a copy later; can only do it when we
2575 * do this read in one go (matching page sizes)
2576 */
2577 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2578 TARGET_PAGE_SIZE);
2579 }
2580 break;
2581 case RAM_SAVE_FLAG_EOS:
2582 /* normal exit */
2583 break;
2584 default:
2585 error_report("Unknown combination of migration flags: %#x"
2586 " (postcopy mode)", flags);
2587 ret = -EINVAL;
2588 }
2589
2590 if (place_needed) {
2591 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2592 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2593
a7180877 2594 if (all_zero) {
df9ff5e1
DDAG
2595 ret = postcopy_place_page_zero(mis, place_dest,
2596 block->page_size);
a7180877 2597 } else {
df9ff5e1
DDAG
2598 ret = postcopy_place_page(mis, place_dest,
2599 place_source, block->page_size);
a7180877
DDAG
2600 }
2601 }
2602 if (!ret) {
2603 ret = qemu_file_get_error(f);
2604 }
2605 }
2606
2607 return ret;
2608}
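/*
 * Worked example of the placement arithmetic in ram_load_postcopy() above,
 * assuming a hypothetical RAMBlock backed by 2MB huge pages and 4KB target
 * pages; the host addresses are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

static void toy_postcopy_placement_example(void)
{
    const uintptr_t hp_size = 2 * 1024 * 1024;      /* block->page_size */
    const uintptr_t tp_size = 4 * 1024;             /* TARGET_PAGE_SIZE */
    const uintptr_t hp_base = 0x40000000;           /* 2MB-aligned host page */

    /* third target page of the host page: copied into the temporary buffer */
    uintptr_t host = hp_base + 2 * tp_size;
    assert((host & (hp_size - 1)) == 2 * tp_size);      /* page_buffer offset */
    assert(((host + tp_size) & (hp_size - 1)) != 0);    /* not placed yet */

    /* last target page of the host page: triggers the atomic placement */
    host = hp_base + hp_size - tp_size;
    assert(((host + tp_size) & (hp_size - 1)) == 0);
}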
2609
56e93d26
JQ
2610static int ram_load(QEMUFile *f, void *opaque, int version_id)
2611{
2612 int flags = 0, ret = 0;
2613 static uint64_t seq_iter;
2614 int len = 0;
a7180877
DDAG
2615 /*
2616 * If system is running in postcopy mode, page inserts to host memory must
2617 * be atomic
2618 */
2619 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2620 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2621 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2622
2623 seq_iter++;
2624
2625 if (version_id != 4) {
2626 ret = -EINVAL;
2627 }
2628
2629 /* This RCU critical section can be very long running.
2630 * When RCU reclaims in the code start to become numerous,
2631 * it will be necessary to reduce the granularity of this
2632 * critical section.
2633 */
2634 rcu_read_lock();
a7180877
DDAG
2635
2636 if (postcopy_running) {
2637 ret = ram_load_postcopy(f);
2638 }
2639
2640 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2641 ram_addr_t addr, total_ram_bytes;
a776aa15 2642 void *host = NULL;
56e93d26
JQ
2643 uint8_t ch;
2644
2645 addr = qemu_get_be64(f);
2646 flags = addr & ~TARGET_PAGE_MASK;
2647 addr &= TARGET_PAGE_MASK;
2648
a776aa15
DDAG
2649 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2650 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2651 RAMBlock *block = ram_block_from_stream(f, flags);
2652
2653 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2654 if (!host) {
2655 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2656 ret = -EINVAL;
2657 break;
2658 }
2659 }
2660
56e93d26
JQ
2661 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2662 case RAM_SAVE_FLAG_MEM_SIZE:
2663 /* Synchronize RAM block list */
2664 total_ram_bytes = addr;
2665 while (!ret && total_ram_bytes) {
2666 RAMBlock *block;
56e93d26
JQ
2667 char id[256];
2668 ram_addr_t length;
2669
2670 len = qemu_get_byte(f);
2671 qemu_get_buffer(f, (uint8_t *)id, len);
2672 id[len] = 0;
2673 length = qemu_get_be64(f);
2674
e3dd7493
DDAG
2675 block = qemu_ram_block_by_name(id);
2676 if (block) {
2677 if (length != block->used_length) {
2678 Error *local_err = NULL;
56e93d26 2679
fa53a0e5 2680 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2681 &local_err);
2682 if (local_err) {
2683 error_report_err(local_err);
56e93d26 2684 }
56e93d26 2685 }
ef08fb38
DDAG
2686 /* For postcopy we need to check hugepage sizes match */
2687 if (postcopy_advised &&
2688 block->page_size != qemu_host_page_size) {
2689 uint64_t remote_page_size = qemu_get_be64(f);
2690 if (remote_page_size != block->page_size) {
2691 error_report("Mismatched RAM page size %s "
2692 "(local) %zd != %" PRId64,
2693 id, block->page_size,
2694 remote_page_size);
2695 ret = -EINVAL;
2696 }
2697 }
e3dd7493
DDAG
2698 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2699 block->idstr);
2700 } else {
56e93d26
JQ
2701 error_report("Unknown ramblock \"%s\", cannot "
2702 "accept migration", id);
2703 ret = -EINVAL;
2704 }
2705
2706 total_ram_bytes -= length;
2707 }
2708 break;
a776aa15 2709
56e93d26 2710 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2711 ch = qemu_get_byte(f);
2712 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2713 break;
a776aa15 2714
56e93d26 2715 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2716 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2717 break;
56e93d26 2718
a776aa15 2719 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2720 len = qemu_get_be32(f);
2721 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2722 error_report("Invalid compressed data length: %d", len);
2723 ret = -EINVAL;
2724 break;
2725 }
c1bc6626 2726 decompress_data_with_multi_threads(f, host, len);
56e93d26 2727 break;
a776aa15 2728
56e93d26 2729 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2730 if (load_xbzrle(f, addr, host) < 0) {
2731 error_report("Failed to decompress XBZRLE page at "
2732 RAM_ADDR_FMT, addr);
2733 ret = -EINVAL;
2734 break;
2735 }
2736 break;
2737 case RAM_SAVE_FLAG_EOS:
2738 /* normal exit */
2739 break;
2740 default:
2741 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2742 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2743 } else {
2744 error_report("Unknown combination of migration flags: %#x",
2745 flags);
2746 ret = -EINVAL;
2747 }
2748 }
2749 if (!ret) {
2750 ret = qemu_file_get_error(f);
2751 }
2752 }
2753
5533b2e9 2754 wait_for_decompress_done();
56e93d26 2755 rcu_read_unlock();
55c4446b 2756 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2757 return ret;
2758}
2759
2760static SaveVMHandlers savevm_ram_handlers = {
2761 .save_live_setup = ram_save_setup,
2762 .save_live_iterate = ram_save_iterate,
763c906b 2763 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2764 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2765 .save_live_pending = ram_save_pending,
2766 .load_state = ram_load,
6ad2a215 2767 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2768};
2769
2770void ram_mig_init(void)
2771{
2772 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2773 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2774}