/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"

/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
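/* These flags are OR'd into the low bits of the page offset that
 * save_page_header() puts on the wire; page offsets are TARGET_PAGE_SIZE
 * aligned, so the low bits are otherwise unused.
 */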

static uint8_t *ZERO_TARGET_PAGE;

static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns the new_size or negative in case of error.
 *
 * @new_size: new cache size
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
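        /* Nothing to do if the rounded-down new size matches the size the
         * cache is already using.
         */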
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}

struct RAMBitmap {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once;
     * only maintained and used in postcopy at the moment,
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
};
typedef struct RAMBitmap RAMBitmap;

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last offset we have sent data from */
    ram_addr_t last_offset;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* How many times we have synchronized the bitmap */
    uint64_t bitmap_sync_count;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Accounting fields */
    /* number of zero pages.  It used to be pages filled with the same char. */
    uint64_t zero_pages;
    /* number of normal transferred pages */
    uint64_t norm_pages;
    /* Iterations since start */
    uint64_t iterations;
    /* xbzrle transmitted bytes.  Note that these are compressed, so they
     * can't be computed from the number of pages */
    uint64_t xbzrle_bytes;
    /* xbzrle transmitted pages */
    uint64_t xbzrle_pages;
    /* xbzrle number of cache misses */
    uint64_t xbzrle_cache_miss;
    /* xbzrle miss rate */
    double xbzrle_cache_miss_rate;
    /* xbzrle number of overflows */
    uint64_t xbzrle_overflows;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* total number of bytes transferred */
    uint64_t bytes_transferred;
    /* number of dirtied pages in the last second */
    uint64_t dirty_pages_rate;
    /* Count of requests incoming from destination */
    uint64_t postcopy_requests;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* Ram Bitmap protected by RCU */
    RAMBitmap *ram_bitmap;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState ram_state;

uint64_t dup_mig_pages_transferred(void)
{
    return ram_state.zero_pages;
}

uint64_t norm_mig_pages_transferred(void)
{
    return ram_state.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return ram_state.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return ram_state.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return ram_state.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return ram_state.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return ram_state.xbzrle_overflows;
}

uint64_t ram_bytes_transferred(void)
{
    return ram_state.bytes_transferred;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
}

uint64_t ram_dirty_sync_count(void)
{
    return ram_state.bitmap_sync_count;
}

uint64_t ram_dirty_pages_rate(void)
{
    return ram_state.dirty_pages_rate;
}

uint64_t ram_postcopy_requests(void)
{
    return ram_state.postcopy_requests;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current offset to search from */
    ram_addr_t   offset;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used together with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
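        /* A non-NULL param->block signals a newly queued page: take the
         * work, compress it outside param->mutex, then flag ourselves as
         * done under comp_done_lock so the migration thread can collect
         * the output.
         */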
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page;
 *          the lower bits contain flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
    }
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 rs->bitmap_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
        rs->xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             rs->bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        rs->xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
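    /* Account for the ENCODING_FLAG_XBZRLE byte and the 2-byte encoded
     * length that were sent along with the encoded buffer.
     */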
    bytes_xbzrle += encoded_len + 1 + 2;
    rs->xbzrle_pages++;
    rs->xbzrle_bytes += bytes_xbzrle;
    rs->bytes_transferred += bytes_xbzrle;

    return 1;
}

/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: starting address (typically so we can continue from previous page)
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static inline
ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                       ram_addr_t start,
                                       ram_addr_t *ram_addr_abs)
{
    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t rb_size = rb->used_length;
    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
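    /* During the bulk stage every page is still dirty, so skip the bitmap
     * scan and simply move to the next page of the block.
     */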
    if (rs->ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    *ram_addr_abs = next << TARGET_PAGE_BITS;
    return (next - base) << TARGET_PAGE_BITS;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs, ram_addr_t addr)
{
    bool ret;
    int nr = addr >> TARGET_PAGE_BITS;
    unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;

    ret = test_and_clear_bit(nr, bitmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
                                        ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        summary |= block->page_size;
    }

    return summary;
}

static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    rs->bitmap_sync_count++;

    if (!rs->bytes_xfer_prev) {
        rs->bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(rs, block->offset, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the amount of dirtied bytes is 50% more than
               the approximate amount of bytes that just got transferred
               since the last time we were in this routine. If that happens
               twice, start or increase throttling. */
            bytes_xfer_now = ram_bytes_transferred();

            if (rs->dirty_pages_rate &&
                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (rs->dirty_rate_high_cnt++ >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
            rs->bytes_xfer_prev = bytes_xfer_now;
        }

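        /* Recompute the XBZRLE cache miss rate as the number of new misses
         * per iteration over the period that just ended.
         */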
        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                rs->xbzrle_cache_miss_rate =
                    (double)(rs->xbzrle_cache_miss -
                             rs->xbzrle_cache_miss_prev) /
                    (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
        }
        rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
    }
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 */
static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
                          ram_addr_t offset, uint8_t *p)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
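        /* A zero page is sent as the page header with RAM_SAVE_FLAG_COMPRESS
         * set, followed by a single fill byte (0).
         */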
        rs->zero_pages++;
        rs->bytes_transferred +=
            save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        rs->bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}

static void ram_release_pages(MigrationState *ms, const char *rbname,
                              uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
        return;
    }

    ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                         PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, send the page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        rs->bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

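    /* RAM_SAVE_FLAG_CONTINUE tells the destination to keep using the block
     * name sent with the previous page instead of expecting a new one.
     */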
    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                rs->norm_pages++;
            } else if (bytes_xmit == 0) {
                rs->zero_pages++;
            }
        }
    } else {
        pages = save_zero_page(rs, f, block, offset, p);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(ms, block->idstr, pss->offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        rs->bytes_transferred += save_page_header(f, block,
                                                  offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy(ms));
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        rs->bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        rs->norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(migrate_get_current(), block->idstr,
                          offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs, QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
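    /* First wait until every compression thread has finished its current
     * page, then append each thread's buffered output to the stream.
     */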
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            rs->bytes_transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, QEMUFile *f,
                                           RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
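    /* Look for an idle compression thread: flush whatever it produced last
     * time, hand it the new page and wake it; if none is idle, wait on
     * comp_done_cond until one finishes.
     */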
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                rs->norm_pages++;
                rs->bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}

/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
                                    QEMUFile *f,
                                    PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        rs->bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                rs->norm_pages++;
            } else if (bytes_xmit == 0) {
                rs->zero_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs, f);
            pages = save_zero_page(rs, f, block, offset, p);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    rs->bytes_transferred += bytes_xmit + blen;
                    rs->norm_pages++;
                    pages = 1;
                } else {
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        } else {
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(rs, f, block, offset, p);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, f, block, offset);
            } else {
                ram_release_pages(ms, block->idstr, pss->offset, pages);
            }
        }
    }

    return pages;
}

/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns whether a page is found
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
                             bool *again, ram_addr_t *ram_addr_abs)
{
    pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
                                              ram_addr_abs);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->offset >= rs->last_offset) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (pss->offset >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->offset = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs, f);
                compression_switch = false;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset,
                              ram_addr_t *ram_addr_abs)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;
        *ram_addr_abs = (entry->offset + entry->rb->offset) &
                        TARGET_PAGE_MASK;

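        /* Consume one target page of the request; if the request spanned
         * more than one page, leave the remainder queued.
         */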
        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns whether a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @ram_addr_abs: pointer into which to store the address of the dirty page
 *                within the global ram_addr space
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss,
                            ram_addr_t *ram_addr_abs)
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset, ram_addr_abs);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long *bitmap;
            bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(
                    block->idstr, (uint64_t)offset,
                    (uint64_t)*ram_addr_abs,
                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
                             atomic_rcu_read(&rs->ram_bitmap)->unsentmap));
            } else {
                trace_get_queued_page(block->idstr,
                                      (uint64_t)offset,
                                      (uint64_t)*ram_addr_abs);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->offset = offset;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any pages are left, we drop them.
 *
 */
void migration_page_queue_free(void)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    RAMState *rs = &ram_state;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = &ram_state;

    rs->postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                                PageSearchStatus *pss,
                                bool last_stage,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check if the page is dirty and if so, send it */
    if (migration_bitmap_clear_dirty(rs, dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(rs, ms, f, pss, last_stage);
        } else {
            res = ram_save_page(rs, ms, f, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            rs->last_sent_block = pss->block;
        }
    }

    return res;
}

/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
                              PageSearchStatus *pss,
                              bool last_stage,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    size_t pagesize = qemu_ram_pagesize(pss->block);

    do {
        tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
                                        dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (pss->offset & (pagesize - 1));

    /* The offset we leave with is the last one we looked at */
    pss->offset -= TARGET_PAGE_SIZE;
    return pages;
}

/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage)
{
    PageSearchStatus pss;
    MigrationState *ms = migrate_get_current();
    int pages = 0;
    bool again, found;
    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                 ram_addr_t space */

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.offset = rs->last_offset;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

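    /* Queued postcopy requests take priority; only fall back to scanning
     * the dirty bitmap when the request queue is empty.
     */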
    do {
        again = true;
        found = get_queued_page(rs, &pss, &dirty_ram_abs);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
        }

        if (found) {
            pages = ram_save_host_page(rs, ms, f, &pss, last_stage,
                                       dirty_ram_abs);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_offset = pss.offset;

    return pages;
}

void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    RAMState *rs = &ram_state;

    if (zero) {
        rs->zero_pages += pages;
    } else {
        rs->norm_pages += pages;
        rs->bytes_transferred += size;
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}

void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}

static void migration_bitmap_free(struct RAMBitmap *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap);
    g_free(bmap);
}

static void ram_migration_cleanup(void *opaque)
{
    RAMState *rs = opaque;

    /* The caller must hold the iothread lock or be in a bh, so there is
     * no writing race against this migration_bitmap.
     */
    struct RAMBitmap *bitmap = rs->ram_bitmap;
    atomic_rcu_set(&rs->ram_bitmap, NULL);
    if (bitmap) {
        memory_global_dirty_log_stop();
        call_rcu(bitmap, migration_bitmap_free, rcu);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(ZERO_TARGET_PAGE);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}

static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_offset = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    RAMState *rs = &ram_state;

    /* called in the qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (rs->ram_bitmap) {
        struct RAMBitmap *old_bitmap = rs->ram_bitmap, *bitmap;
        bitmap = g_new(struct RAMBitmap, 1);
        bitmap->bmap = bitmap_new(new);

        /* Prevent bits in the migration_bitmap from being set by
         * migration_bitmap_sync_range() at the same time.  It is safe
         * for migration if migration_bitmap bits are cleared at the
         * same time.
         */
        qemu_mutex_lock(&rs->bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&rs->ram_bitmap, bitmap);
        qemu_mutex_unlock(&rs->bitmap_mutex);
        rs->migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    RAMState *rs = &ram_state;
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&rs->ram_bitmap)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    RAMState *rs = &ram_state;
    struct RAMBlock *block;
    unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
        unsigned long run_start = find_next_zero_bit(bitmap, range, first);

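        /* Every clear bit is a page that has already been migrated, so the
         * source can discard (release) those runs of pages.
         */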
        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        unsigned long start,
                                        unsigned long length)
{
    RAMState *rs = &ram_state;
    unsigned long end = start + length; /* one after the end */
    unsigned long current;
    unsigned long *unsentmap;

    unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
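    /* Walk the unsentmap looking for runs of set bits; each run becomes one
     * discard range sent to the destination.
     */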
e0b266f0
DDAG
1658 for (current = start; current < end; ) {
1659 unsigned long one = find_next_bit(unsentmap, end, current);
1660
1661 if (one <= end) {
1662 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1663 unsigned long discard_length;
1664
1665 if (zero >= end) {
1666 discard_length = end - one;
1667 } else {
1668 discard_length = zero - one;
1669 }
d688c62d
DDAG
1670 if (discard_length) {
1671 postcopy_discard_send_range(ms, pds, one, discard_length);
1672 }
e0b266f0
DDAG
1673 current = one + discard_length;
1674 } else {
1675 current = one;
1676 }
1677 }
1678
1679 return 0;
1680}
1681
3d0684b2
JQ
1682/**
1683 * postcopy_each_ram_send_discard: discard all RAMBlocks
1684 *
1685 * Returns 0 for success or negative for error
1686 *
e0b266f0
DDAG
1687 * Utility for the outgoing postcopy code.
1688 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1689 * passing it bitmap indexes and name.
e0b266f0
DDAG
1690 * (qemu_ram_foreach_block ends up passing unscaled lengths
1691 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1692 *
1693 * @ms: current migration state
e0b266f0
DDAG
1694 */
1695static int postcopy_each_ram_send_discard(MigrationState *ms)
1696{
1697 struct RAMBlock *block;
1698 int ret;
1699
1700 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1701 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1702 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1703 first,
1704 block->idstr);
1705
1706 /*
1707 * Postcopy sends chunks of bitmap over the wire, but it
1708 * just needs indexes at this point, which avoids it needing
1709 * target page specific code.
1710 */
1711 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1712 block->used_length >> TARGET_PAGE_BITS);
1713 postcopy_discard_send_finish(ms, pds);
1714 if (ret) {
1715 return ret;
1716 }
1717 }
1718
1719 return 0;
1720}
1721
3d0684b2
JQ
1722/**
1723 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1724 *
1725 * Helper for postcopy_chunk_hostpages; it's called twice to
1726 * canonicalize the two bitmaps, which are similar, but one is
1727 * inverted.
99e314eb 1728 *
3d0684b2
JQ
1729 * Postcopy requires that all target pages in a hostpage are dirty or
1730 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1731 *
3d0684b2
JQ
1732 * @ms: current migration state
1733 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1734 * otherwise we need to canonicalize partially dirty host pages
1735 * @block: block that contains the page we want to canonicalize
1736 * @pds: state for postcopy
99e314eb
DDAG
1737 */
1738static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1739 RAMBlock *block,
1740 PostcopyDiscardState *pds)
1741{
0d8ec885 1742 RAMState *rs = &ram_state;
99e314eb
DDAG
1743 unsigned long *bitmap;
1744 unsigned long *unsentmap;
29c59172 1745 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1746 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1747 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1748 unsigned long last = first + (len - 1);
1749 unsigned long run_start;
1750
29c59172
DDAG
1751 if (block->page_size == TARGET_PAGE_SIZE) {
1752 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1753 return;
1754 }
1755
eb859c53
JQ
1756 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
1757 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
99e314eb
DDAG
1758
1759 if (unsent_pass) {
1760 /* Find a sent page */
1761 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1762 } else {
1763 /* Find a dirty page */
1764 run_start = find_next_bit(bitmap, last + 1, first);
1765 }
1766
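/*
 * Worked example (illustrative, for a block starting at page 0): with
 * 2MiB host pages and 4KiB target pages, host_ratio is 512.  A run
 * starting at target page 513 is in the middle of a host page, so it
 * is rounded down to 512 and that whole host page is discarded and
 * re-marked dirty/unsent by the loop below.
 */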
1767 while (run_start <= last) {
1768 bool do_fixup = false;
1769 unsigned long fixup_start_addr;
1770 unsigned long host_offset;
1771
1772 /*
1773 * If the start of this run of pages is in the middle of a host
1774 * page, then we need to fixup this host page.
1775 */
1776 host_offset = run_start % host_ratio;
1777 if (host_offset) {
1778 do_fixup = true;
1779 run_start -= host_offset;
1780 fixup_start_addr = run_start;
1781 /* For the next pass */
1782 run_start = run_start + host_ratio;
1783 } else {
1784 /* Find the end of this run */
1785 unsigned long run_end;
1786 if (unsent_pass) {
1787 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1788 } else {
1789 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1790 }
1791 /*
1792 * If the end isn't at the start of a host page, then the
1793 * run doesn't finish at the end of a host page
1794 * and we need to discard.
1795 */
1796 host_offset = run_end % host_ratio;
1797 if (host_offset) {
1798 do_fixup = true;
1799 fixup_start_addr = run_end - host_offset;
1800 /*
1801 * This host page has gone, the next loop iteration starts
1802 * from after the fixup
1803 */
1804 run_start = fixup_start_addr + host_ratio;
1805 } else {
1806 /*
1807 * No discards on this iteration, next loop starts from
1808 * next sent/dirty page
1809 */
1810 run_start = run_end + 1;
1811 }
1812 }
1813
1814 if (do_fixup) {
1815 unsigned long page;
1816
1817 /* Tell the destination to discard this page */
1818 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1819 /* For the unsent_pass we:
1820 * discard partially sent pages
1821 * For the !unsent_pass (dirty) we:
1822 * discard partially dirty pages that were sent
1823 * (any partially sent pages were already discarded
1824 * by the previous unsent_pass)
1825 */
1826 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1827 host_ratio);
1828 }
1829
1830 /* Clean up the bitmap */
1831 for (page = fixup_start_addr;
1832 page < fixup_start_addr + host_ratio; page++) {
1833 /* All pages in this host page are now not sent */
1834 set_bit(page, unsentmap);
1835
1836 /*
1837 * Remark them as dirty, updating the count for any pages
1838 * that weren't previously dirty.
1839 */
0d8ec885 1840 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1841 }
1842 }
1843
1844 if (unsent_pass) {
1845 /* Find the next sent page for the next iteration */
1846 run_start = find_next_zero_bit(unsentmap, last + 1,
1847 run_start);
1848 } else {
1849 /* Find the next dirty page for the next iteration */
1850 run_start = find_next_bit(bitmap, last + 1, run_start);
1851 }
1852 }
1853}
1854
3d0684b2
JQ
1855/**
1856 * postcopy_chunk_hostpages: discard any partially sent host page
1857 *
99e314eb
DDAG
1858 * Utility for the outgoing postcopy code.
1859 *
1860 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1861 * dirty host-page size chunks as all dirty. In this case the host-page
1862 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1863 *
3d0684b2
JQ
1864 * Returns zero on success
1865 *
1866 * @ms: current migration state
99e314eb
DDAG
1867 */
1868static int postcopy_chunk_hostpages(MigrationState *ms)
1869{
6f37bb8b 1870 RAMState *rs = &ram_state;
99e314eb
DDAG
1871 struct RAMBlock *block;
1872
99e314eb 1873 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1874 rs->last_seen_block = NULL;
1875 rs->last_sent_block = NULL;
1876 rs->last_offset = 0;
99e314eb
DDAG
1877
1878 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1879 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1880
1881 PostcopyDiscardState *pds =
1882 postcopy_discard_send_init(ms, first, block->idstr);
1883
1884 /* First pass: Discard all partially sent host pages */
1885 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1886 /*
1887 * Second pass: Ensure that all partially dirty host pages are made
1888 * fully dirty.
1889 */
1890 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1891
1892 postcopy_discard_send_finish(ms, pds);
1893 } /* ram_list loop */
1894
1895 return 0;
1896}
1897
3d0684b2
JQ
1898/**
1899 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1900 *
1901 * Returns zero on success
1902 *
e0b266f0
DDAG
1903 * Transmit the set of pages to be discarded after precopy to the target
1904 * these are pages that:
1905 * a) Have been previously transmitted but are now dirty again
1906 * b) Pages that have never been transmitted; this ensures that
1907 * any pages on the destination that have been mapped by background
1908 * tasks get discarded (transparent huge pages are the specific concern)
1909 * Hopefully this is pretty sparse
3d0684b2
JQ
1910 *
1911 * @ms: current migration state
e0b266f0
DDAG
1912 */
1913int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1914{
eb859c53 1915 RAMState *rs = &ram_state;
e0b266f0
DDAG
1916 int ret;
1917 unsigned long *bitmap, *unsentmap;
1918
1919 rcu_read_lock();
1920
1921 /* This should be our last sync, the src is now paused */
eb859c53 1922 migration_bitmap_sync(rs);
e0b266f0 1923
eb859c53 1924 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1925 if (!unsentmap) {
1926 /* We don't have a safe way to resize the unsentmap, so
1927 * if the bitmap was resized it will be NULL at this
1928 * point.
1929 */
1930 error_report("migration ram resized during precopy phase");
1931 rcu_read_unlock();
1932 return -EINVAL;
1933 }
1934
29c59172 1935 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1936 ret = postcopy_chunk_hostpages(ms);
1937 if (ret) {
1938 rcu_read_unlock();
1939 return ret;
1940 }
1941
e0b266f0
DDAG
1942 /*
1943 * Update the unsentmap to be unsentmap = unsentmap | dirty
1944 */
eb859c53 1945 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
e0b266f0
DDAG
1946 bitmap_or(unsentmap, unsentmap, bitmap,
1947 last_ram_offset() >> TARGET_PAGE_BITS);
1948
1949
1950 trace_ram_postcopy_send_discard_bitmap();
1951#ifdef DEBUG_POSTCOPY
1952 ram_debug_dump_bitmap(unsentmap, true);
1953#endif
1954
1955 ret = postcopy_each_ram_send_discard(ms);
1956 rcu_read_unlock();
1957
1958 return ret;
1959}
1960
3d0684b2
JQ
1961/**
1962 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1963 *
3d0684b2 1964 * Returns zero on success
e0b266f0 1965 *
3d0684b2 1966 * @mis: current migration incoming state
36449157
JQ
1967 * @rbname: name of the RAMBlock of the request. NULL means the
1968 * same as the last one.
3d0684b2
JQ
1969 * @start: RAMBlock starting page
1970 * @length: RAMBlock size
e0b266f0
DDAG
1971 */
1972int ram_discard_range(MigrationIncomingState *mis,
36449157 1973 const char *rbname,
e0b266f0
DDAG
1974 uint64_t start, size_t length)
1975{
1976 int ret = -1;
1977
36449157 1978 trace_ram_discard_range(rbname, start, length);
d3a5038c 1979
e0b266f0 1980 rcu_read_lock();
36449157 1981 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1982
1983 if (!rb) {
36449157 1984 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1985 goto err;
1986 }
1987
d3a5038c 1988 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1989
1990err:
1991 rcu_read_unlock();
1992
1993 return ret;
1994}
1995
ceb4d168 1996static int ram_state_init(RAMState *rs)
56e93d26 1997{
56e93d26
JQ
1998 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1999
ceb4d168 2000 memset(rs, 0, sizeof(*rs));
108cfae0 2001 qemu_mutex_init(&rs->bitmap_mutex);
ec481c6c
JQ
2002 qemu_mutex_init(&rs->src_page_req_mutex);
2003 QSIMPLEQ_INIT(&rs->src_page_requests);
56e93d26
JQ
2004
2005 if (migrate_use_xbzrle()) {
2006 XBZRLE_cache_lock();
adb65dec 2007 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2008 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2009 TARGET_PAGE_SIZE,
2010 TARGET_PAGE_SIZE);
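/*
 * The cache above is sized in target pages: e.g. a 64MiB cache with
 * 4KiB target pages gives 16384 slots (illustrative numbers only).
 */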
2011 if (!XBZRLE.cache) {
2012 XBZRLE_cache_unlock();
2013 error_report("Error creating cache");
2014 return -1;
2015 }
2016 XBZRLE_cache_unlock();
2017
2018 /* We prefer not to abort if there is no memory */
2019 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2020 if (!XBZRLE.encoded_buf) {
2021 error_report("Error allocating encoded_buf");
2022 return -1;
2023 }
2024
2025 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2026 if (!XBZRLE.current_buf) {
2027 error_report("Error allocating current_buf");
2028 g_free(XBZRLE.encoded_buf);
2029 XBZRLE.encoded_buf = NULL;
2030 return -1;
2031 }
56e93d26
JQ
2032 }
2033
49877834
PB
2034 /* For memory_global_dirty_log_start below. */
2035 qemu_mutex_lock_iothread();
2036
56e93d26
JQ
2037 qemu_mutex_lock_ramlist();
2038 rcu_read_lock();
6f37bb8b 2039 ram_state_reset(rs);
56e93d26 2040
eb859c53 2041 rs->ram_bitmap = g_new0(struct RAMBitmap, 1);
0827b9e9
AA
2042 /* Skip setting bitmap if there is no RAM */
2043 if (ram_bytes_total()) {
2044 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53
JQ
2045 rs->ram_bitmap->bmap = bitmap_new(ram_bitmap_pages);
2046 bitmap_set(rs->ram_bitmap->bmap, 0, ram_bitmap_pages);
0827b9e9
AA
2047
2048 if (migrate_postcopy_ram()) {
eb859c53
JQ
2049 rs->ram_bitmap->unsentmap = bitmap_new(ram_bitmap_pages);
2050 bitmap_set(rs->ram_bitmap->unsentmap, 0, ram_bitmap_pages);
0827b9e9 2051 }
f3f491fc
DDAG
2052 }
2053
56e93d26
JQ
2054 /*
2055 * Count the total number of pages used by ram blocks not including any
2056 * gaps due to alignment or unplugs.
2057 */
0d8ec885 2058 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
2059
2060 memory_global_dirty_log_start();
8d820d6f 2061 migration_bitmap_sync(rs);
56e93d26 2062 qemu_mutex_unlock_ramlist();
49877834 2063 qemu_mutex_unlock_iothread();
a91246c9
HZ
2064 rcu_read_unlock();
2065
2066 return 0;
2067}
2068
3d0684b2
JQ
2069/*
2070 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2071 * long-running RCU critical section. When rcu-reclaims in the code
2072 * start to become numerous it will be necessary to reduce the
2073 * granularity of these critical sections.
2074 */
2075
3d0684b2
JQ
2076/**
2077 * ram_save_setup: Setup RAM for migration
2078 *
2079 * Returns zero to indicate success and negative for error
2080 *
2081 * @f: QEMUFile where to send the data
2082 * @opaque: RAMState pointer
2083 */
a91246c9
HZ
2084static int ram_save_setup(QEMUFile *f, void *opaque)
2085{
6f37bb8b 2086 RAMState *rs = opaque;
a91246c9
HZ
2087 RAMBlock *block;
2088
2089 /* migration has already setup the bitmap, reuse it. */
2090 if (!migration_in_colo_state()) {
ceb4d168 2091 if (ram_state_init(rs) < 0) {
a91246c9
HZ
2092 return -1;
2093 }
2094 }
204b88b8 2095 rs->f = f;
a91246c9
HZ
2096
2097 rcu_read_lock();
56e93d26
JQ
2098
2099 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
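/*
 * What follows on the wire for each block (sketch, not normative):
 *   byte   strlen(idstr)
 *   bytes  idstr (not NUL-terminated)
 *   be64   used_length
 *   be64   page_size   (only if postcopy is enabled and the block's
 *                       page size differs from qemu_host_page_size)
 */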
2100
2101 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2102 qemu_put_byte(f, strlen(block->idstr));
2103 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2104 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2105 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2106 qemu_put_be64(f, block->page_size);
2107 }
56e93d26
JQ
2108 }
2109
2110 rcu_read_unlock();
2111
2112 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2113 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2114
2115 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2116
2117 return 0;
2118}
2119
3d0684b2
JQ
2120/**
2121 * ram_save_iterate: iterative stage for migration
2122 *
2123 * Returns zero to indicate success and negative for error
2124 *
2125 * @f: QEMUFile where to send the data
2126 * @opaque: RAMState pointer
2127 */
56e93d26
JQ
2128static int ram_save_iterate(QEMUFile *f, void *opaque)
2129{
6f37bb8b 2130 RAMState *rs = opaque;
56e93d26
JQ
2131 int ret;
2132 int i;
2133 int64_t t0;
5c90308f 2134 int done = 0;
56e93d26
JQ
2135
2136 rcu_read_lock();
6f37bb8b
JQ
2137 if (ram_list.version != rs->last_version) {
2138 ram_state_reset(rs);
56e93d26
JQ
2139 }
2140
2141 /* Read version before ram_list.blocks */
2142 smp_rmb();
2143
2144 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2145
2146 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2147 i = 0;
2148 while ((ret = qemu_file_rate_limit(f)) == 0) {
2149 int pages;
2150
072c2511 2151 pages = ram_find_and_save_block(rs, f, false);
56e93d26
JQ
2152 /* no more pages to send */
2153 if (pages == 0) {
5c90308f 2154 done = 1;
56e93d26
JQ
2155 break;
2156 }
23b28c3c 2157 rs->iterations++;
070afca2 2158
56e93d26
JQ
2159 /* we want to check in the 1st loop, just in case it was the 1st time
2160 and we had to sync the dirty bitmap.
2161 qemu_clock_get_ns() is a bit expensive, so we only check every
2162 64 iterations.
2163 */
2164 if ((i & 63) == 0) {
2165 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2166 if (t1 > MAX_WAIT) {
55c4446b 2167 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2168 break;
2169 }
2170 }
2171 i++;
2172 }
2f4fde93 2173 flush_compressed_data(rs, f);
56e93d26
JQ
2174 rcu_read_unlock();
2175
2176 /*
2177 * Must occur before EOS (or any QEMUFile operation)
2178 * because of RDMA protocol.
2179 */
2180 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2181
2182 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2f4fde93 2183 rs->bytes_transferred += 8;
56e93d26
JQ
2184
2185 ret = qemu_file_get_error(f);
2186 if (ret < 0) {
2187 return ret;
2188 }
2189
5c90308f 2190 return done;
56e93d26
JQ
2191}
2192
3d0684b2
JQ
2193/**
2194 * ram_save_complete: function called to send the remaining amount of ram
2195 *
2196 * Returns zero to indicate success
2197 *
2198 * Called with iothread lock
2199 *
2200 * @f: QEMUFile where to send the data
2201 * @opaque: RAMState pointer
2202 */
56e93d26
JQ
2203static int ram_save_complete(QEMUFile *f, void *opaque)
2204{
6f37bb8b
JQ
2205 RAMState *rs = opaque;
2206
56e93d26
JQ
2207 rcu_read_lock();
2208
663e6c1d 2209 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2210 migration_bitmap_sync(rs);
663e6c1d 2211 }
56e93d26
JQ
2212
2213 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2214
2215 /* try transferring iterative blocks of memory */
2216
2217 /* flush all remaining blocks regardless of rate limiting */
2218 while (true) {
2219 int pages;
2220
072c2511 2221 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state());
56e93d26
JQ
2222 /* no more blocks to send */
2223 if (pages == 0) {
2224 break;
2225 }
2226 }
2227
2f4fde93 2228 flush_compressed_data(rs, f);
56e93d26 2229 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2230
2231 rcu_read_unlock();
d09a6fde 2232
56e93d26
JQ
2233 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2234
2235 return 0;
2236}
2237
c31b098f
DDAG
2238static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2239 uint64_t *non_postcopiable_pending,
2240 uint64_t *postcopiable_pending)
56e93d26 2241{
8d820d6f 2242 RAMState *rs = opaque;
56e93d26
JQ
2243 uint64_t remaining_size;
2244
9edabd4d 2245 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2246
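/*
 * If we look close to finishing, resync the dirty bitmap (under the
 * iothread lock) so the estimate reported back also covers pages
 * dirtied since the last sync.
 */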
663e6c1d
DDAG
2247 if (!migration_in_postcopy(migrate_get_current()) &&
2248 remaining_size < max_size) {
56e93d26
JQ
2249 qemu_mutex_lock_iothread();
2250 rcu_read_lock();
8d820d6f 2251 migration_bitmap_sync(rs);
56e93d26
JQ
2252 rcu_read_unlock();
2253 qemu_mutex_unlock_iothread();
9edabd4d 2254 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2255 }
c31b098f
DDAG
2256
2257 /* We can do postcopy, and all the data is postcopiable */
2258 *postcopiable_pending += remaining_size;
56e93d26
JQ
2259}
2260
2261static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2262{
2263 unsigned int xh_len;
2264 int xh_flags;
063e760a 2265 uint8_t *loaded_data;
56e93d26
JQ
2266
2267 if (!xbzrle_decoded_buf) {
2268 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2269 }
063e760a 2270 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2271
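/*
 * On-the-wire XBZRLE page layout, as read below: a one-byte
 * ENCODING_FLAG_XBZRLE marker, a be16 encoded length, then the
 * encoded data itself (at most TARGET_PAGE_SIZE bytes).
 */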
2272 /* extract RLE header */
2273 xh_flags = qemu_get_byte(f);
2274 xh_len = qemu_get_be16(f);
2275
2276 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2277 error_report("Failed to load XBZRLE page - wrong compression!");
2278 return -1;
2279 }
2280
2281 if (xh_len > TARGET_PAGE_SIZE) {
2282 error_report("Failed to load XBZRLE page - len overflow!");
2283 return -1;
2284 }
2285 /* load data and decode */
063e760a 2286 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2287
2288 /* decode RLE */
063e760a 2289 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2290 TARGET_PAGE_SIZE) == -1) {
2291 error_report("Failed to load XBZRLE page - decode error!");
2292 return -1;
2293 }
2294
2295 return 0;
2296}
2297
3d0684b2
JQ
2298/**
2299 * ram_block_from_stream: read a RAMBlock id from the migration stream
2300 *
2301 * Must be called from within a rcu critical section.
2302 *
56e93d26 2303 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2304 *
3d0684b2
JQ
2305 * @f: QEMUFile where to read the data from
2306 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2307 */
3d0684b2 2308static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2309{
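/*
 * The last block seen is cached in a function-local static so that
 * pages flagged RAM_SAVE_FLAG_CONTINUE can reuse it instead of
 * carrying the block id again on the wire.
 */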
2310 static RAMBlock *block = NULL;
2311 char id[256];
2312 uint8_t len;
2313
2314 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2315 if (!block) {
56e93d26
JQ
2316 error_report("Ack, bad migration stream!");
2317 return NULL;
2318 }
4c4bad48 2319 return block;
56e93d26
JQ
2320 }
2321
2322 len = qemu_get_byte(f);
2323 qemu_get_buffer(f, (uint8_t *)id, len);
2324 id[len] = 0;
2325
e3dd7493 2326 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2327 if (!block) {
2328 error_report("Can't find block %s", id);
2329 return NULL;
56e93d26
JQ
2330 }
2331
4c4bad48
HZ
2332 return block;
2333}
2334
2335static inline void *host_from_ram_block_offset(RAMBlock *block,
2336 ram_addr_t offset)
2337{
2338 if (!offset_in_ramblock(block, offset)) {
2339 return NULL;
2340 }
2341
2342 return block->host + offset;
56e93d26
JQ
2343}
2344
3d0684b2
JQ
2345/**
2346 * ram_handle_compressed: handle the zero page case
2347 *
56e93d26
JQ
2348 * If a page (or a whole RDMA chunk) has been
2349 * determined to be zero, then zap it.
3d0684b2
JQ
2350 *
2351 * @host: host address for the zero page
2352 * @ch: what the page is filled with. We only support zero
2353 * @size: size of the zero page
56e93d26
JQ
2354 */
2355void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2356{
2357 if (ch != 0 || !is_zero_range(host, size)) {
2358 memset(host, ch, size);
2359 }
2360}
2361
2362static void *do_data_decompress(void *opaque)
2363{
2364 DecompressParam *param = opaque;
2365 unsigned long pagesize;
33d151f4
LL
2366 uint8_t *des;
2367 int len;
56e93d26 2368
33d151f4 2369 qemu_mutex_lock(&param->mutex);
90e56fb4 2370 while (!param->quit) {
33d151f4
LL
2371 if (param->des) {
2372 des = param->des;
2373 len = param->len;
2374 param->des = 0;
2375 qemu_mutex_unlock(&param->mutex);
2376
56e93d26 2377 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2378 /* uncompress() can fail in some cases, especially
2379 * when the page is dirtied while it is being compressed; that's
2380 * not a problem because the dirty page will be retransferred
2381 * and uncompress() won't corrupt the data in other pages.
2382 */
33d151f4
LL
2383 uncompress((Bytef *)des, &pagesize,
2384 (const Bytef *)param->compbuf, len);
73a8912b 2385
33d151f4
LL
2386 qemu_mutex_lock(&decomp_done_lock);
2387 param->done = true;
2388 qemu_cond_signal(&decomp_done_cond);
2389 qemu_mutex_unlock(&decomp_done_lock);
2390
2391 qemu_mutex_lock(&param->mutex);
2392 } else {
2393 qemu_cond_wait(&param->cond, &param->mutex);
2394 }
56e93d26 2395 }
33d151f4 2396 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2397
2398 return NULL;
2399}
2400
5533b2e9
LL
2401static void wait_for_decompress_done(void)
2402{
2403 int idx, thread_count;
2404
2405 if (!migrate_use_compression()) {
2406 return;
2407 }
2408
2409 thread_count = migrate_decompress_threads();
2410 qemu_mutex_lock(&decomp_done_lock);
2411 for (idx = 0; idx < thread_count; idx++) {
2412 while (!decomp_param[idx].done) {
2413 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2414 }
2415 }
2416 qemu_mutex_unlock(&decomp_done_lock);
2417}
2418
56e93d26
JQ
2419void migrate_decompress_threads_create(void)
2420{
2421 int i, thread_count;
2422
2423 thread_count = migrate_decompress_threads();
2424 decompress_threads = g_new0(QemuThread, thread_count);
2425 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2426 qemu_mutex_init(&decomp_done_lock);
2427 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2428 for (i = 0; i < thread_count; i++) {
2429 qemu_mutex_init(&decomp_param[i].mutex);
2430 qemu_cond_init(&decomp_param[i].cond);
2431 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2432 decomp_param[i].done = true;
90e56fb4 2433 decomp_param[i].quit = false;
56e93d26
JQ
2434 qemu_thread_create(decompress_threads + i, "decompress",
2435 do_data_decompress, decomp_param + i,
2436 QEMU_THREAD_JOINABLE);
2437 }
2438}
2439
2440void migrate_decompress_threads_join(void)
2441{
2442 int i, thread_count;
2443
56e93d26
JQ
2444 thread_count = migrate_decompress_threads();
2445 for (i = 0; i < thread_count; i++) {
2446 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2447 decomp_param[i].quit = true;
56e93d26
JQ
2448 qemu_cond_signal(&decomp_param[i].cond);
2449 qemu_mutex_unlock(&decomp_param[i].mutex);
2450 }
2451 for (i = 0; i < thread_count; i++) {
2452 qemu_thread_join(decompress_threads + i);
2453 qemu_mutex_destroy(&decomp_param[i].mutex);
2454 qemu_cond_destroy(&decomp_param[i].cond);
2455 g_free(decomp_param[i].compbuf);
2456 }
2457 g_free(decompress_threads);
2458 g_free(decomp_param);
56e93d26
JQ
2459 decompress_threads = NULL;
2460 decomp_param = NULL;
56e93d26
JQ
2461}
2462
c1bc6626 2463static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2464 void *host, int len)
2465{
2466 int idx, thread_count;
2467
2468 thread_count = migrate_decompress_threads();
73a8912b 2469 qemu_mutex_lock(&decomp_done_lock);
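/*
 * Hand the compressed buffer to the first idle decompression thread;
 * if none is idle, wait on decomp_done_cond until a worker signals
 * completion and retry the scan.
 */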
56e93d26
JQ
2470 while (true) {
2471 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2472 if (decomp_param[idx].done) {
33d151f4
LL
2473 decomp_param[idx].done = false;
2474 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2475 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2476 decomp_param[idx].des = host;
2477 decomp_param[idx].len = len;
33d151f4
LL
2478 qemu_cond_signal(&decomp_param[idx].cond);
2479 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2480 break;
2481 }
2482 }
2483 if (idx < thread_count) {
2484 break;
73a8912b
LL
2485 } else {
2486 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2487 }
2488 }
73a8912b 2489 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2490}
2491
3d0684b2
JQ
2492/**
2493 * ram_postcopy_incoming_init: allocate postcopy data structures
2494 *
2495 * Returns 0 for success and negative if there was one error
2496 *
2497 * @mis: current migration incoming state
2498 *
2499 * Allocate data structures etc needed by incoming migration with
2500 * postcopy-ram. postcopy-ram's similarly named
2501 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2502 */
2503int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2504{
2505 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2506
2507 return postcopy_ram_incoming_init(mis, ram_pages);
2508}
2509
3d0684b2
JQ
2510/**
2511 * ram_load_postcopy: load a page in postcopy case
2512 *
2513 * Returns 0 for success or -errno in case of error
2514 *
a7180877
DDAG
2515 * Called in postcopy mode by ram_load().
2516 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2517 *
2518 * @f: QEMUFile where to send the data
a7180877
DDAG
2519 */
2520static int ram_load_postcopy(QEMUFile *f)
2521{
2522 int flags = 0, ret = 0;
2523 bool place_needed = false;
28abd200 2524 bool matching_page_sizes = false;
a7180877
DDAG
2525 MigrationIncomingState *mis = migration_incoming_get_current();
2526 /* Temporary page that is later 'placed' */
2527 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2528 void *last_host = NULL;
a3b6ff6d 2529 bool all_zero = false;
a7180877
DDAG
2530
2531 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2532 ram_addr_t addr;
2533 void *host = NULL;
2534 void *page_buffer = NULL;
2535 void *place_source = NULL;
df9ff5e1 2536 RAMBlock *block = NULL;
a7180877 2537 uint8_t ch;
a7180877
DDAG
2538
2539 addr = qemu_get_be64(f);
2540 flags = addr & ~TARGET_PAGE_MASK;
2541 addr &= TARGET_PAGE_MASK;
2542
2543 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2544 place_needed = false;
2545 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2546 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2547
2548 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2549 if (!host) {
2550 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2551 ret = -EINVAL;
2552 break;
2553 }
28abd200 2554 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2555 /*
28abd200
DDAG
2556 * Postcopy requires that we place whole host pages atomically;
2557 * these may be huge pages for RAMBlocks that are backed by
2558 * hugetlbfs.
a7180877
DDAG
2559 * To make it atomic, the data is read into a temporary page
2560 * that's moved into place later.
2561 * The migration protocol uses, possibly smaller, target pages;
2562 * however, the source ensures it always sends all the components
2563 * of a host page in order.
2564 */
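/*
 * Illustrative numbers: with 2MiB host pages and 4KiB target pages the
 * temporary host page collects 512 target pages; place_needed only
 * becomes true for the final (512th) target page, at which point the
 * whole host page is placed atomically.
 */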
2565 page_buffer = postcopy_host_page +
28abd200 2566 ((uintptr_t)host & (block->page_size - 1));
a7180877 2567 /* If all TP are zero then we can optimise the place */
28abd200 2568 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2569 all_zero = true;
c53b7ddc
DDAG
2570 } else {
2571 /* not the 1st TP within the HP */
2572 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2573 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2574 host, last_host);
2575 ret = -EINVAL;
2576 break;
2577 }
a7180877
DDAG
2578 }
2579
c53b7ddc 2580
a7180877
DDAG
2581 /*
2582 * If it's the last part of a host page then we place the host
2583 * page
2584 */
2585 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2586 (block->page_size - 1)) == 0;
a7180877
DDAG
2587 place_source = postcopy_host_page;
2588 }
c53b7ddc 2589 last_host = host;
a7180877
DDAG
2590
2591 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2592 case RAM_SAVE_FLAG_COMPRESS:
2593 ch = qemu_get_byte(f);
2594 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2595 if (ch) {
2596 all_zero = false;
2597 }
2598 break;
2599
2600 case RAM_SAVE_FLAG_PAGE:
2601 all_zero = false;
2602 if (!place_needed || !matching_page_sizes) {
2603 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2604 } else {
2605 /* Avoids the qemu_file copy during postcopy, which is
2606 * going to do a copy later; can only do it when we
2607 * do this read in one go (matching page sizes)
2608 */
2609 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2610 TARGET_PAGE_SIZE);
2611 }
2612 break;
2613 case RAM_SAVE_FLAG_EOS:
2614 /* normal exit */
2615 break;
2616 default:
2617 error_report("Unknown combination of migration flags: %#x"
2618 " (postcopy mode)", flags);
2619 ret = -EINVAL;
2620 }
2621
2622 if (place_needed) {
2623 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2624 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2625
a7180877 2626 if (all_zero) {
df9ff5e1
DDAG
2627 ret = postcopy_place_page_zero(mis, place_dest,
2628 block->page_size);
a7180877 2629 } else {
df9ff5e1
DDAG
2630 ret = postcopy_place_page(mis, place_dest,
2631 place_source, block->page_size);
a7180877
DDAG
2632 }
2633 }
2634 if (!ret) {
2635 ret = qemu_file_get_error(f);
2636 }
2637 }
2638
2639 return ret;
2640}
2641
56e93d26
JQ
2642static int ram_load(QEMUFile *f, void *opaque, int version_id)
2643{
2644 int flags = 0, ret = 0;
2645 static uint64_t seq_iter;
2646 int len = 0;
a7180877
DDAG
2647 /*
2648 * If system is running in postcopy mode, page inserts to host memory must
2649 * be atomic
2650 */
2651 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2652 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2653 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2654
2655 seq_iter++;
2656
2657 if (version_id != 4) {
2658 ret = -EINVAL;
2659 }
2660
2661 /* This RCU critical section can be very long running.
2662 * When RCU reclaims in the code start to become numerous,
2663 * it will be necessary to reduce the granularity of this
2664 * critical section.
2665 */
2666 rcu_read_lock();
a7180877
DDAG
2667
2668 if (postcopy_running) {
2669 ret = ram_load_postcopy(f);
2670 }
2671
2672 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2673 ram_addr_t addr, total_ram_bytes;
a776aa15 2674 void *host = NULL;
56e93d26
JQ
2675 uint8_t ch;
2676
2677 addr = qemu_get_be64(f);
2678 flags = addr & ~TARGET_PAGE_MASK;
2679 addr &= TARGET_PAGE_MASK;
2680
a776aa15
DDAG
2681 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2682 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2683 RAMBlock *block = ram_block_from_stream(f, flags);
2684
2685 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2686 if (!host) {
2687 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2688 ret = -EINVAL;
2689 break;
2690 }
2691 }
2692
56e93d26
JQ
2693 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2694 case RAM_SAVE_FLAG_MEM_SIZE:
2695 /* Synchronize RAM block list */
2696 total_ram_bytes = addr;
2697 while (!ret && total_ram_bytes) {
2698 RAMBlock *block;
56e93d26
JQ
2699 char id[256];
2700 ram_addr_t length;
2701
2702 len = qemu_get_byte(f);
2703 qemu_get_buffer(f, (uint8_t *)id, len);
2704 id[len] = 0;
2705 length = qemu_get_be64(f);
2706
e3dd7493
DDAG
2707 block = qemu_ram_block_by_name(id);
2708 if (block) {
2709 if (length != block->used_length) {
2710 Error *local_err = NULL;
56e93d26 2711
fa53a0e5 2712 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2713 &local_err);
2714 if (local_err) {
2715 error_report_err(local_err);
56e93d26 2716 }
56e93d26 2717 }
ef08fb38
DDAG
2718 /* For postcopy we need to check hugepage sizes match */
2719 if (postcopy_advised &&
2720 block->page_size != qemu_host_page_size) {
2721 uint64_t remote_page_size = qemu_get_be64(f);
2722 if (remote_page_size != block->page_size) {
2723 error_report("Mismatched RAM page size %s "
2724 "(local) %zd != %" PRId64,
2725 id, block->page_size,
2726 remote_page_size);
2727 ret = -EINVAL;
2728 }
2729 }
e3dd7493
DDAG
2730 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2731 block->idstr);
2732 } else {
56e93d26
JQ
2733 error_report("Unknown ramblock \"%s\", cannot "
2734 "accept migration", id);
2735 ret = -EINVAL;
2736 }
2737
2738 total_ram_bytes -= length;
2739 }
2740 break;
a776aa15 2741
56e93d26 2742 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2743 ch = qemu_get_byte(f);
2744 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2745 break;
a776aa15 2746
56e93d26 2747 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2748 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2749 break;
56e93d26 2750
a776aa15 2751 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2752 len = qemu_get_be32(f);
2753 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2754 error_report("Invalid compressed data length: %d", len);
2755 ret = -EINVAL;
2756 break;
2757 }
c1bc6626 2758 decompress_data_with_multi_threads(f, host, len);
56e93d26 2759 break;
a776aa15 2760
56e93d26 2761 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2762 if (load_xbzrle(f, addr, host) < 0) {
2763 error_report("Failed to decompress XBZRLE page at "
2764 RAM_ADDR_FMT, addr);
2765 ret = -EINVAL;
2766 break;
2767 }
2768 break;
2769 case RAM_SAVE_FLAG_EOS:
2770 /* normal exit */
2771 break;
2772 default:
2773 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2774 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2775 } else {
2776 error_report("Unknown combination of migration flags: %#x",
2777 flags);
2778 ret = -EINVAL;
2779 }
2780 }
2781 if (!ret) {
2782 ret = qemu_file_get_error(f);
2783 }
2784 }
2785
5533b2e9 2786 wait_for_decompress_done();
56e93d26 2787 rcu_read_unlock();
55c4446b 2788 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2789 return ret;
2790}
2791
2792static SaveVMHandlers savevm_ram_handlers = {
2793 .save_live_setup = ram_save_setup,
2794 .save_live_iterate = ram_save_iterate,
763c906b 2795 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2796 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2797 .save_live_pending = ram_save_pending,
2798 .load_state = ram_load,
6ad2a215 2799 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2800};
2801
2802void ram_mig_init(void)
2803{
2804 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2805 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2806}