[mirror_qemu.git] / migration / ram.c
ram: Pass RAMBlock to bitmap_sync
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h start with 0x100 next */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
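/*
 * Because RAM offsets are target-page aligned, the low bits of the 64-bit
 * address written by save_page_header() are free to carry the flags above.
 * A minimal sketch of the packing (the unpacking shown here is an assumption
 * about the receive side, not a quote of it):
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);      // sender
 *
 *     uint64_t addr = qemu_get_be64(f);                   // receiver
 *     int flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;
 */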
60
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* This struct contains the XBZRLE cache and a static page
 69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
 98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
102 *
 103 * Returns the resulting cache size (rounded down to a power of two) or negative on error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
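/*
 * A minimal usage sketch (the real caller is qmp_migrate_set_cache_size, as
 * noted above; the error handling here is illustrative only).  Note that the
 * value actually installed is pow2floor(new_size), which is what is returned:
 *
 *     int64_t ret = xbzrle_cache_resize(64 * 1024 * 1024);
 *     if (ret < 0) {
 *         error_report("failed to resize the XBZRLE cache");
 *     }
 */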
140
eb859c53
JQ
141struct RAMBitmap {
142 struct rcu_head rcu;
143 /* Main migration bitmap */
144 unsigned long *bmap;
145 /* bitmap of pages that haven't been sent even once
146 * only maintained and used in postcopy at the moment
147 * where it's used to send the dirtymap at the start
148 * of the postcopy phase
149 */
150 unsigned long *unsentmap;
151};
152typedef struct RAMBitmap RAMBitmap;
153
ec481c6c
JQ
154/*
155 * An outstanding page request, on the source, having been received
156 * and queued
157 */
158struct RAMSrcPageRequest {
159 RAMBlock *rb;
160 hwaddr offset;
161 hwaddr len;
162
163 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
164};
165
6f37bb8b
JQ
166/* State of RAM for migration */
167struct RAMState {
204b88b8
JQ
168 /* QEMUFile used for this migration */
169 QEMUFile *f;
6f37bb8b
JQ
170 /* Last block that we have visited searching for dirty pages */
171 RAMBlock *last_seen_block;
172 /* Last block from where we have sent data */
173 RAMBlock *last_sent_block;
174 /* Last offset we have sent data from */
175 ram_addr_t last_offset;
176 /* last ram version we have seen */
177 uint32_t last_version;
178 /* We are in the first round */
179 bool ram_bulk_stage;
8d820d6f
JQ
180 /* How many times we have dirty too many pages */
181 int dirty_rate_high_cnt;
5a987738
JQ
182 /* How many times we have synchronized the bitmap */
183 uint64_t bitmap_sync_count;
f664da80
JQ
184 /* these variables are used for bitmap sync */
185 /* last time we did a full bitmap_sync */
186 int64_t time_last_bitmap_sync;
eac74159 187 /* bytes transferred at start_time */
c4bdf0cf 188 uint64_t bytes_xfer_prev;
a66cd90c 189 /* number of dirty pages since start_time */
68908ed6 190 uint64_t num_dirty_pages_period;
b5833fde
JQ
191 /* xbzrle misses since the beginning of the period */
192 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
193 /* number of iterations at the beginning of period */
194 uint64_t iterations_prev;
f7ccd61b
JQ
195 /* Accounting fields */
 196 /* number of zero pages; these used to be counted as pages filled with the same char */
197 uint64_t zero_pages;
b4d1c6e7
JQ
198 /* number of normal transferred pages */
199 uint64_t norm_pages;
23b28c3c
JQ
200 /* Iterations since start */
201 uint64_t iterations;
f36ada95
JQ
 202 /* xbzrle transmitted bytes. Note that these are after
 203 * compression, so they can't be calculated from the page count */
07ed50a2 204 uint64_t xbzrle_bytes;
f36ada95
JQ
 205 /* xbzrle transmitted pages */
206 uint64_t xbzrle_pages;
544c36f1
JQ
 207 /* xbzrle number of cache misses */
208 uint64_t xbzrle_cache_miss;
b07016b6
JQ
209 /* xbzrle miss rate */
210 double xbzrle_cache_miss_rate;
180f61f7
JQ
211 /* xbzrle number of overflows */
212 uint64_t xbzrle_overflows;
0d8ec885
JQ
213 /* number of dirty bits in the bitmap */
214 uint64_t migration_dirty_pages;
2f4fde93
JQ
215 /* total number of bytes transferred */
216 uint64_t bytes_transferred;
47ad8619
JQ
217 /* number of dirtied pages in the last second */
218 uint64_t dirty_pages_rate;
96506894
JQ
219 /* Count of requests incoming from destination */
220 uint64_t postcopy_requests;
108cfae0
JQ
221 /* protects modification of the bitmap */
222 QemuMutex bitmap_mutex;
eb859c53
JQ
223 /* Ram Bitmap protected by RCU */
224 RAMBitmap *ram_bitmap;
68a098f3
JQ
225 /* The RAMBlock used in the last src_page_requests */
226 RAMBlock *last_req_rb;
ec481c6c
JQ
227 /* Queue of outstanding page requests from the destination */
228 QemuMutex src_page_req_mutex;
229 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
230};
231typedef struct RAMState RAMState;
232
233static RAMState ram_state;
234
56e93d26
JQ
235uint64_t dup_mig_pages_transferred(void)
236{
f7ccd61b 237 return ram_state.zero_pages;
56e93d26
JQ
238}
239
56e93d26
JQ
240uint64_t norm_mig_pages_transferred(void)
241{
b4d1c6e7 242 return ram_state.norm_pages;
56e93d26
JQ
243}
244
245uint64_t xbzrle_mig_bytes_transferred(void)
246{
07ed50a2 247 return ram_state.xbzrle_bytes;
56e93d26
JQ
248}
249
250uint64_t xbzrle_mig_pages_transferred(void)
251{
f36ada95 252 return ram_state.xbzrle_pages;
56e93d26
JQ
253}
254
255uint64_t xbzrle_mig_pages_cache_miss(void)
256{
544c36f1 257 return ram_state.xbzrle_cache_miss;
56e93d26
JQ
258}
259
260double xbzrle_mig_cache_miss_rate(void)
261{
b07016b6 262 return ram_state.xbzrle_cache_miss_rate;
56e93d26
JQ
263}
264
265uint64_t xbzrle_mig_pages_overflow(void)
266{
180f61f7 267 return ram_state.xbzrle_overflows;
56e93d26
JQ
268}
269
9edabd4d 270uint64_t ram_bytes_transferred(void)
0d8ec885 271{
9edabd4d 272 return ram_state.bytes_transferred;
0d8ec885
JQ
273}
274
9edabd4d 275uint64_t ram_bytes_remaining(void)
2f4fde93 276{
9edabd4d 277 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
278}
279
42d219d3
JQ
280uint64_t ram_dirty_sync_count(void)
281{
282 return ram_state.bitmap_sync_count;
283}
284
47ad8619
JQ
285uint64_t ram_dirty_pages_rate(void)
286{
287 return ram_state.dirty_pages_rate;
288}
289
96506894
JQ
290uint64_t ram_postcopy_requests(void)
291{
292 return ram_state.postcopy_requests;
293}
294
b8fb8cb7
DDAG
295/* used by the search for pages to send */
296struct PageSearchStatus {
297 /* Current block being searched */
298 RAMBlock *block;
299 /* Current offset to search from */
300 ram_addr_t offset;
301 /* Set once we wrap around */
302 bool complete_round;
303};
304typedef struct PageSearchStatus PageSearchStatus;
305
56e93d26 306struct CompressParam {
56e93d26 307 bool done;
90e56fb4 308 bool quit;
56e93d26
JQ
309 QEMUFile *file;
310 QemuMutex mutex;
311 QemuCond cond;
312 RAMBlock *block;
313 ram_addr_t offset;
314};
315typedef struct CompressParam CompressParam;
316
317struct DecompressParam {
73a8912b 318 bool done;
90e56fb4 319 bool quit;
56e93d26
JQ
320 QemuMutex mutex;
321 QemuCond cond;
322 void *des;
d341d9f3 323 uint8_t *compbuf;
56e93d26
JQ
324 int len;
325};
326typedef struct DecompressParam DecompressParam;
327
328static CompressParam *comp_param;
329static QemuThread *compress_threads;
330/* comp_done_cond is used to wake up the migration thread when
331 * one of the compression threads has finished the compression.
 332 * comp_done_lock is used together with comp_done_cond.
333 */
0d9f9a5c
LL
334static QemuMutex comp_done_lock;
335static QemuCond comp_done_cond;
56e93d26
JQ
336/* The empty QEMUFileOps will be used by file in CompressParam */
337static const QEMUFileOps empty_ops = { };
338
56e93d26
JQ
339static DecompressParam *decomp_param;
340static QemuThread *decompress_threads;
73a8912b
LL
341static QemuMutex decomp_done_lock;
342static QemuCond decomp_done_cond;
56e93d26 343
a7a9a88f
LL
344static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
345 ram_addr_t offset);
56e93d26
JQ
346
347static void *do_data_compress(void *opaque)
348{
349 CompressParam *param = opaque;
a7a9a88f
LL
350 RAMBlock *block;
351 ram_addr_t offset;
56e93d26 352
a7a9a88f 353 qemu_mutex_lock(&param->mutex);
90e56fb4 354 while (!param->quit) {
a7a9a88f
LL
355 if (param->block) {
356 block = param->block;
357 offset = param->offset;
358 param->block = NULL;
359 qemu_mutex_unlock(&param->mutex);
360
361 do_compress_ram_page(param->file, block, offset);
362
0d9f9a5c 363 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 364 param->done = true;
0d9f9a5c
LL
365 qemu_cond_signal(&comp_done_cond);
366 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
367
368 qemu_mutex_lock(&param->mutex);
369 } else {
56e93d26
JQ
370 qemu_cond_wait(&param->cond, &param->mutex);
371 }
56e93d26 372 }
a7a9a88f 373 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
374
375 return NULL;
376}
377
378static inline void terminate_compression_threads(void)
379{
380 int idx, thread_count;
381
382 thread_count = migrate_compress_threads();
3d0684b2 383
56e93d26
JQ
384 for (idx = 0; idx < thread_count; idx++) {
385 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 386 comp_param[idx].quit = true;
56e93d26
JQ
387 qemu_cond_signal(&comp_param[idx].cond);
388 qemu_mutex_unlock(&comp_param[idx].mutex);
389 }
390}
391
392void migrate_compress_threads_join(void)
393{
394 int i, thread_count;
395
396 if (!migrate_use_compression()) {
397 return;
398 }
399 terminate_compression_threads();
400 thread_count = migrate_compress_threads();
401 for (i = 0; i < thread_count; i++) {
402 qemu_thread_join(compress_threads + i);
403 qemu_fclose(comp_param[i].file);
404 qemu_mutex_destroy(&comp_param[i].mutex);
405 qemu_cond_destroy(&comp_param[i].cond);
406 }
0d9f9a5c
LL
407 qemu_mutex_destroy(&comp_done_lock);
408 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
409 g_free(compress_threads);
410 g_free(comp_param);
56e93d26
JQ
411 compress_threads = NULL;
412 comp_param = NULL;
56e93d26
JQ
413}
414
415void migrate_compress_threads_create(void)
416{
417 int i, thread_count;
418
419 if (!migrate_use_compression()) {
420 return;
421 }
56e93d26
JQ
422 thread_count = migrate_compress_threads();
423 compress_threads = g_new0(QemuThread, thread_count);
424 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
425 qemu_cond_init(&comp_done_cond);
426 qemu_mutex_init(&comp_done_lock);
56e93d26 427 for (i = 0; i < thread_count; i++) {
e110aa91
C
428 /* comp_param[i].file is just used as a dummy buffer to save data,
429 * set its ops to empty.
56e93d26
JQ
430 */
431 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
432 comp_param[i].done = true;
90e56fb4 433 comp_param[i].quit = false;
56e93d26
JQ
434 qemu_mutex_init(&comp_param[i].mutex);
435 qemu_cond_init(&comp_param[i].cond);
436 qemu_thread_create(compress_threads + i, "compress",
437 do_data_compress, comp_param + i,
438 QEMU_THREAD_JOINABLE);
439 }
440}
441
442/**
3d0684b2 443 * save_page_header: write page header to wire
56e93d26
JQ
444 *
445 * If this is the 1st block, it also writes the block identification
446 *
3d0684b2 447 * Returns the number of bytes written
56e93d26
JQ
448 *
449 * @f: QEMUFile where to send the data
450 * @block: block that contains the page we want to send
451 * @offset: offset inside the block for the page
452 * in the lower bits, it contains flags
453 */
454static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
455{
9f5f380b 456 size_t size, len;
56e93d26
JQ
457
458 qemu_put_be64(f, offset);
459 size = 8;
460
461 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
462 len = strlen(block->idstr);
463 qemu_put_byte(f, len);
464 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
465 size += 1 + len;
56e93d26
JQ
466 }
467 return size;
468}
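/*
 * On the wire the header produced above is therefore (sizes in bytes, a
 * sketch derived directly from the code, not a separate spec):
 *
 *     +--------------------+---------------+--------------+
 *     | offset|flags  (8)  | idstr len (1) | idstr (len)  |
 *     +--------------------+---------------+--------------+
 *
 * where the last two fields are only present when RAM_SAVE_FLAG_CONTINUE is
 * not set, i.e. the first time a page from a given block is sent.
 */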
469
3d0684b2
JQ
470/**
 471 * mig_throttle_guest_down: throttle down the guest
472 *
473 * Reduce amount of guest cpu execution to hopefully slow down memory
474 * writes. If guest dirty memory rate is reduced below the rate at
475 * which we can transfer pages to the destination then we should be
476 * able to complete migration. Some workloads dirty memory way too
477 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
478 */
479static void mig_throttle_guest_down(void)
480{
481 MigrationState *s = migrate_get_current();
2594f56d
DB
482 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
483 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
484
485 /* We have not started throttling yet. Let's start it. */
486 if (!cpu_throttle_active()) {
487 cpu_throttle_set(pct_initial);
488 } else {
489 /* Throttling already on, just increase the rate */
490 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
491 }
492}
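/*
 * Illustrative progression, assuming cpu_throttle_initial is 20 and
 * cpu_throttle_increment is 10 (their usual defaults; both are migration
 * parameters and may differ in practice):
 *
 *     cpu_throttle_set(20);                                    // first call
 *     cpu_throttle_set(cpu_throttle_get_percentage() + 10);    // 30%, then 40%, ...
 *
 * so repeated calls keep tightening the throttle until migration converges.
 */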
493
3d0684b2
JQ
494/**
495 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
496 *
6f37bb8b 497 * @rs: current RAM state
3d0684b2
JQ
498 * @current_addr: address for the zero page
499 *
500 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
501 * The important thing is that a stale (not-yet-0'd) page be replaced
502 * by the new data.
503 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 504 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 505 */
6f37bb8b 506static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 507{
6f37bb8b 508 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
509 return;
510 }
511
512 /* We don't care if this fails to allocate a new cache page
513 * as long as it updated an old one */
514 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 515 rs->bitmap_sync_count);
56e93d26
JQ
516}
517
518#define ENCODING_FLAG_XBZRLE 0x1
519
520/**
521 * save_xbzrle_page: compress and send current page
522 *
523 * Returns: 1 means that we wrote the page
524 * 0 means that page is identical to the one already sent
525 * -1 means that xbzrle would be longer than normal
526 *
5a987738 527 * @rs: current RAM state
3d0684b2
JQ
528 * @current_data: pointer to the address of the page contents
529 * @current_addr: addr of the page
56e93d26
JQ
530 * @block: block that contains the page we want to send
531 * @offset: offset inside the block for the page
532 * @last_stage: if we are at the completion stage
56e93d26 533 */
204b88b8 534static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 535 ram_addr_t current_addr, RAMBlock *block,
072c2511 536 ram_addr_t offset, bool last_stage)
56e93d26
JQ
537{
538 int encoded_len = 0, bytes_xbzrle;
539 uint8_t *prev_cached_page;
540
5a987738 541 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
544c36f1 542 rs->xbzrle_cache_miss++;
56e93d26
JQ
543 if (!last_stage) {
544 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 545 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
546 return -1;
547 } else {
548 /* update *current_data when the page has been
549 inserted into cache */
550 *current_data = get_cached_data(XBZRLE.cache, current_addr);
551 }
552 }
553 return -1;
554 }
555
556 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
557
558 /* save current buffer into memory */
559 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
560
561 /* XBZRLE encoding (if there is no overflow) */
562 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
563 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
564 TARGET_PAGE_SIZE);
565 if (encoded_len == 0) {
55c4446b 566 trace_save_xbzrle_page_skipping();
56e93d26
JQ
567 return 0;
568 } else if (encoded_len == -1) {
55c4446b 569 trace_save_xbzrle_page_overflow();
180f61f7 570 rs->xbzrle_overflows++;
56e93d26
JQ
571 /* update data in the cache */
572 if (!last_stage) {
573 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
574 *current_data = prev_cached_page;
575 }
576 return -1;
577 }
578
579 /* we need to update the data in the cache, in order to get the same data */
580 if (!last_stage) {
581 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
582 }
583
584 /* Send XBZRLE based compressed page */
204b88b8
JQ
585 bytes_xbzrle = save_page_header(rs->f, block,
586 offset | RAM_SAVE_FLAG_XBZRLE);
587 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
588 qemu_put_be16(rs->f, encoded_len);
589 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 590 bytes_xbzrle += encoded_len + 1 + 2;
f36ada95 591 rs->xbzrle_pages++;
07ed50a2 592 rs->xbzrle_bytes += bytes_xbzrle;
072c2511 593 rs->bytes_transferred += bytes_xbzrle;
56e93d26
JQ
594
595 return 1;
596}
597
3d0684b2
JQ
598/**
599 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 600 *
3d0684b2
JQ
601 * Called with rcu_read_lock() to protect migration_bitmap
602 *
 603 * Returns the byte offset within the memory region of the start of a dirty page
604 *
6f37bb8b 605 * @rs: current RAM state
3d0684b2
JQ
606 * @rb: RAMBlock where to search for dirty pages
607 * @start: starting address (typically so we can continue from previous page)
608 * @ram_addr_abs: pointer into which to store the address of the dirty page
609 * within the global ram_addr space
f3f491fc 610 */
56e93d26 611static inline
6f37bb8b 612ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
613 ram_addr_t start,
614 ram_addr_t *ram_addr_abs)
56e93d26 615{
2f68e399 616 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 617 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
618 uint64_t rb_size = rb->used_length;
619 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 620 unsigned long *bitmap;
56e93d26
JQ
621
622 unsigned long next;
623
eb859c53 624 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
6f37bb8b 625 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
626 next = nr + 1;
627 } else {
2ff64038 628 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
629 }
630
f3f491fc 631 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
632 return (next - base) << TARGET_PAGE_BITS;
633}
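/*
 * Worked example of the index arithmetic above (assuming a 4KiB target page,
 * i.e. TARGET_PAGE_BITS == 12): for a block at offset 0x40000000 and
 * start == 0x2000, base == 0x40000 and nr == 0x40002, so the search begins at
 * the third page of the block within the global dirty bitmap.
 */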
634
0d8ec885 635static inline bool migration_bitmap_clear_dirty(RAMState *rs, ram_addr_t addr)
a82d593b
DDAG
636{
637 bool ret;
638 int nr = addr >> TARGET_PAGE_BITS;
eb859c53 639 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
a82d593b
DDAG
640
641 ret = test_and_clear_bit(nr, bitmap);
642
643 if (ret) {
0d8ec885 644 rs->migration_dirty_pages--;
a82d593b
DDAG
645 }
646 return ret;
647}
648
15440dd5
JQ
649static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
650 ram_addr_t start, ram_addr_t length)
56e93d26 651{
2ff64038 652 unsigned long *bitmap;
eb859c53 653 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
0d8ec885 654 rs->migration_dirty_pages +=
15440dd5 655 cpu_physical_memory_sync_dirty_bitmap(bitmap, rb, start, length,
0d8ec885 656 &rs->num_dirty_pages_period);
56e93d26
JQ
657}
658
3d0684b2
JQ
659/**
660 * ram_pagesize_summary: calculate all the pagesizes of a VM
661 *
662 * Returns a summary bitmap of the page sizes of all RAMBlocks
663 *
664 * For VMs with just normal pages this is equivalent to the host page
665 * size. If it's got some huge pages then it's the OR of all the
666 * different page sizes.
e8ca1db2
DDAG
667 */
668uint64_t ram_pagesize_summary(void)
669{
670 RAMBlock *block;
671 uint64_t summary = 0;
672
673 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
674 summary |= block->page_size;
675 }
676
677 return summary;
678}
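/*
 * Example: a guest backed by ordinary 4KiB pages plus one 2MiB hugetlbfs
 * RAMBlock yields summary == 0x1000 | 0x200000 == 0x201000, so the
 * destination can tell that at least one block needs huge-page handling.
 */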
679
8d820d6f 680static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
681{
682 RAMBlock *block;
56e93d26 683 int64_t end_time;
c4bdf0cf 684 uint64_t bytes_xfer_now;
56e93d26 685
5a987738 686 rs->bitmap_sync_count++;
56e93d26 687
eac74159
JQ
688 if (!rs->bytes_xfer_prev) {
689 rs->bytes_xfer_prev = ram_bytes_transferred();
56e93d26
JQ
690 }
691
f664da80
JQ
692 if (!rs->time_last_bitmap_sync) {
693 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
694 }
695
696 trace_migration_bitmap_sync_start();
9c1f8f44 697 memory_global_dirty_log_sync();
56e93d26 698
108cfae0 699 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26
JQ
700 rcu_read_lock();
701 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
15440dd5 702 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
703 }
704 rcu_read_unlock();
108cfae0 705 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 706
a66cd90c 707 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 708
56e93d26
JQ
709 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
710
 711 /* more than 1 second = 1000 milliseconds */
f664da80 712 if (end_time > rs->time_last_bitmap_sync + 1000) {
56e93d26
JQ
713 if (migrate_auto_converge()) {
714 /* The following detection logic can be refined later. For now:
 715 Check to see if the dirtied bytes are 50% more than the approx.
716 amount of bytes that just got transferred since the last time we
070afca2
JH
717 were in this routine. If that happens twice, start or increase
718 throttling */
56e93d26 719 bytes_xfer_now = ram_bytes_transferred();
070afca2 720
47ad8619 721 if (rs->dirty_pages_rate &&
a66cd90c 722 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 723 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
8d820d6f 724 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 725 trace_migration_throttle();
8d820d6f 726 rs->dirty_rate_high_cnt = 0;
070afca2 727 mig_throttle_guest_down();
56e93d26 728 }
eac74159 729 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 730 }
070afca2 731
56e93d26 732 if (migrate_use_xbzrle()) {
23b28c3c 733 if (rs->iterations_prev != rs->iterations) {
b07016b6 734 rs->xbzrle_cache_miss_rate =
544c36f1 735 (double)(rs->xbzrle_cache_miss -
b5833fde 736 rs->xbzrle_cache_miss_prev) /
23b28c3c 737 (rs->iterations - rs->iterations_prev);
56e93d26 738 }
23b28c3c 739 rs->iterations_prev = rs->iterations;
544c36f1 740 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
56e93d26 741 }
47ad8619 742 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
f664da80 743 / (end_time - rs->time_last_bitmap_sync);
f664da80 744 rs->time_last_bitmap_sync = end_time;
a66cd90c 745 rs->num_dirty_pages_period = 0;
56e93d26 746 }
4addcd4f 747 if (migrate_use_events()) {
5a987738 748 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 749 }
56e93d26
JQ
750}
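/*
 * Worked example of the auto-converge check above (illustrative numbers,
 * assuming 4KiB target pages): if 300000 pages (~1.2GB) were dirtied during
 * the period while roughly 400MB were transferred, then 1.2GB > 400MB / 2
 * holds, so dirty_rate_high_cnt is bumped; once the counter passes its
 * threshold the guest is throttled via mig_throttle_guest_down().
 */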
751
752/**
3d0684b2 753 * save_zero_page: send the zero page to the stream
56e93d26 754 *
3d0684b2 755 * Returns the number of pages written.
56e93d26 756 *
f7ccd61b 757 * @rs: current RAM state
56e93d26
JQ
758 * @block: block that contains the page we want to send
759 * @offset: offset inside the block for the page
760 * @p: pointer to the page
56e93d26 761 */
ce25d337
JQ
762static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
763 uint8_t *p)
56e93d26
JQ
764{
765 int pages = -1;
766
767 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
f7ccd61b 768 rs->zero_pages++;
072c2511 769 rs->bytes_transferred +=
ce25d337
JQ
770 save_page_header(rs->f, block, offset | RAM_SAVE_FLAG_COMPRESS);
771 qemu_put_byte(rs->f, 0);
072c2511 772 rs->bytes_transferred += 1;
56e93d26
JQ
773 pages = 1;
774 }
775
776 return pages;
777}
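/*
 * Wire cost sketch for a zero page, derived from the code above: the usual
 * page header (with RAM_SAVE_FLAG_COMPRESS set) followed by a single zero
 * byte, so roughly 9 bytes plus the block id the first time a block is
 * named, instead of a full TARGET_PAGE_SIZE payload.
 */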
778
5727309d 779static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 780{
5727309d 781 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
782 return;
783 }
784
36449157 785 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
786}
787
56e93d26 788/**
3d0684b2 789 * ram_save_page: send the given page to the stream
56e93d26 790 *
3d0684b2 791 * Returns the number of pages written.
3fd3c4b3
DDAG
792 * < 0 - error
793 * >=0 - Number of pages written - this might legally be 0
794 * if xbzrle noticed the page was the same.
56e93d26 795 *
6f37bb8b 796 * @rs: current RAM state
56e93d26
JQ
797 * @block: block that contains the page we want to send
798 * @offset: offset inside the block for the page
799 * @last_stage: if we are at the completion stage
56e93d26 800 */
a0a8aa14 801static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
802{
803 int pages = -1;
804 uint64_t bytes_xmit;
805 ram_addr_t current_addr;
56e93d26
JQ
806 uint8_t *p;
807 int ret;
808 bool send_async = true;
a08f6890
HZ
809 RAMBlock *block = pss->block;
810 ram_addr_t offset = pss->offset;
56e93d26 811
2f68e399 812 p = block->host + offset;
56e93d26
JQ
813
 814 /* When in doubt, send the page as normal */
815 bytes_xmit = 0;
ce25d337 816 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
817 offset, TARGET_PAGE_SIZE, &bytes_xmit);
818 if (bytes_xmit) {
072c2511 819 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
820 pages = 1;
821 }
822
823 XBZRLE_cache_lock();
824
825 current_addr = block->offset + offset;
826
6f37bb8b 827 if (block == rs->last_sent_block) {
56e93d26
JQ
828 offset |= RAM_SAVE_FLAG_CONTINUE;
829 }
830 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
831 if (ret != RAM_SAVE_CONTROL_DELAYED) {
832 if (bytes_xmit > 0) {
b4d1c6e7 833 rs->norm_pages++;
56e93d26 834 } else if (bytes_xmit == 0) {
f7ccd61b 835 rs->zero_pages++;
56e93d26
JQ
836 }
837 }
838 } else {
ce25d337 839 pages = save_zero_page(rs, block, offset, p);
56e93d26
JQ
840 if (pages > 0) {
841 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
842 * page would be stale
843 */
6f37bb8b 844 xbzrle_cache_zero_page(rs, current_addr);
5727309d 845 ram_release_pages(block->idstr, pss->offset, pages);
6f37bb8b 846 } else if (!rs->ram_bulk_stage &&
5727309d 847 !migration_in_postcopy() && migrate_use_xbzrle()) {
204b88b8 848 pages = save_xbzrle_page(rs, &p, current_addr, block,
072c2511 849 offset, last_stage);
56e93d26
JQ
850 if (!last_stage) {
851 /* Can't send this cached data async, since the cache page
852 * might get updated before it gets to the wire
853 */
854 send_async = false;
855 }
856 }
857 }
858
859 /* XBZRLE overflow or normal page */
860 if (pages == -1) {
ce25d337 861 rs->bytes_transferred += save_page_header(rs->f, block,
56e93d26
JQ
862 offset | RAM_SAVE_FLAG_PAGE);
863 if (send_async) {
ce25d337 864 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
53f09a10 865 migrate_release_ram() &
5727309d 866 migration_in_postcopy());
56e93d26 867 } else {
ce25d337 868 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
56e93d26 869 }
072c2511 870 rs->bytes_transferred += TARGET_PAGE_SIZE;
56e93d26 871 pages = 1;
b4d1c6e7 872 rs->norm_pages++;
56e93d26
JQ
873 }
874
875 XBZRLE_cache_unlock();
876
877 return pages;
878}
879
a7a9a88f
LL
880static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
881 ram_addr_t offset)
56e93d26
JQ
882{
883 int bytes_sent, blen;
a7a9a88f 884 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 885
a7a9a88f 886 bytes_sent = save_page_header(f, block, offset |
56e93d26 887 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 888 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 889 migrate_compress_level());
b3be2896
LL
890 if (blen < 0) {
891 bytes_sent = 0;
892 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
893 error_report("compressed data failed!");
894 } else {
895 bytes_sent += blen;
5727309d 896 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 897 }
56e93d26
JQ
898
899 return bytes_sent;
900}
901
ce25d337 902static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
903{
904 int idx, len, thread_count;
905
906 if (!migrate_use_compression()) {
907 return;
908 }
909 thread_count = migrate_compress_threads();
a7a9a88f 910
0d9f9a5c 911 qemu_mutex_lock(&comp_done_lock);
56e93d26 912 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 913 while (!comp_param[idx].done) {
0d9f9a5c 914 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 915 }
a7a9a88f 916 }
0d9f9a5c 917 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
918
919 for (idx = 0; idx < thread_count; idx++) {
920 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 921 if (!comp_param[idx].quit) {
ce25d337 922 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2f4fde93 923 rs->bytes_transferred += len;
56e93d26 924 }
a7a9a88f 925 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
926 }
927}
928
929static inline void set_compress_params(CompressParam *param, RAMBlock *block,
930 ram_addr_t offset)
931{
932 param->block = block;
933 param->offset = offset;
934}
935
ce25d337
JQ
936static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
937 ram_addr_t offset)
56e93d26
JQ
938{
939 int idx, thread_count, bytes_xmit = -1, pages = -1;
940
941 thread_count = migrate_compress_threads();
0d9f9a5c 942 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
943 while (true) {
944 for (idx = 0; idx < thread_count; idx++) {
945 if (comp_param[idx].done) {
a7a9a88f 946 comp_param[idx].done = false;
ce25d337 947 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 948 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 949 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
950 qemu_cond_signal(&comp_param[idx].cond);
951 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 952 pages = 1;
b4d1c6e7 953 rs->norm_pages++;
072c2511 954 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
955 break;
956 }
957 }
958 if (pages > 0) {
959 break;
960 } else {
0d9f9a5c 961 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
962 }
963 }
0d9f9a5c 964 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
965
966 return pages;
967}
968
969/**
970 * ram_save_compressed_page: compress the given page and send it to the stream
971 *
3d0684b2 972 * Returns the number of pages written.
56e93d26 973 *
6f37bb8b 974 * @rs: current RAM state
56e93d26
JQ
975 * @block: block that contains the page we want to send
976 * @offset: offset inside the block for the page
977 * @last_stage: if we are at the completion stage
56e93d26 978 */
a0a8aa14
JQ
979static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
980 bool last_stage)
56e93d26
JQ
981{
982 int pages = -1;
fc50438e 983 uint64_t bytes_xmit = 0;
56e93d26 984 uint8_t *p;
fc50438e 985 int ret, blen;
a08f6890
HZ
986 RAMBlock *block = pss->block;
987 ram_addr_t offset = pss->offset;
56e93d26 988
2f68e399 989 p = block->host + offset;
56e93d26 990
ce25d337 991 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
992 offset, TARGET_PAGE_SIZE, &bytes_xmit);
993 if (bytes_xmit) {
072c2511 994 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
995 pages = 1;
996 }
56e93d26
JQ
997 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
998 if (ret != RAM_SAVE_CONTROL_DELAYED) {
999 if (bytes_xmit > 0) {
b4d1c6e7 1000 rs->norm_pages++;
56e93d26 1001 } else if (bytes_xmit == 0) {
f7ccd61b 1002 rs->zero_pages++;
56e93d26
JQ
1003 }
1004 }
1005 } else {
1006 /* When starting the process of a new block, the first page of
1007 * the block should be sent out before other pages in the same
 1008 * block, and all the pages in the last block should have been sent
 1009 * out. Keeping this order is important, because the 'cont' flag
1010 * is used to avoid resending the block name.
1011 */
6f37bb8b 1012 if (block != rs->last_sent_block) {
ce25d337
JQ
1013 flush_compressed_data(rs);
1014 pages = save_zero_page(rs, block, offset, p);
56e93d26 1015 if (pages == -1) {
fc50438e 1016 /* Make sure the first page is sent out before other pages */
ce25d337 1017 bytes_xmit = save_page_header(rs->f, block, offset |
fc50438e 1018 RAM_SAVE_FLAG_COMPRESS_PAGE);
ce25d337 1019 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
fc50438e
LL
1020 migrate_compress_level());
1021 if (blen > 0) {
072c2511 1022 rs->bytes_transferred += bytes_xmit + blen;
b4d1c6e7 1023 rs->norm_pages++;
b3be2896 1024 pages = 1;
fc50438e 1025 } else {
ce25d337 1026 qemu_file_set_error(rs->f, blen);
fc50438e 1027 error_report("compressed data failed!");
b3be2896 1028 }
56e93d26 1029 }
53f09a10 1030 if (pages > 0) {
5727309d 1031 ram_release_pages(block->idstr, pss->offset, pages);
53f09a10 1032 }
56e93d26 1033 } else {
fc50438e 1034 offset |= RAM_SAVE_FLAG_CONTINUE;
ce25d337 1035 pages = save_zero_page(rs, block, offset, p);
56e93d26 1036 if (pages == -1) {
ce25d337 1037 pages = compress_page_with_multi_thread(rs, block, offset);
53f09a10 1038 } else {
5727309d 1039 ram_release_pages(block->idstr, pss->offset, pages);
56e93d26
JQ
1040 }
1041 }
1042 }
1043
1044 return pages;
1045}
1046
3d0684b2
JQ
1047/**
1048 * find_dirty_block: find the next dirty page and update any state
1049 * associated with the search process.
b9e60928 1050 *
3d0684b2 1051 * Returns whether a dirty page was found
b9e60928 1052 *
6f37bb8b 1053 * @rs: current RAM state
3d0684b2
JQ
1054 * @pss: data about the state of the current dirty page scan
1055 * @again: set to false if the search has scanned the whole of RAM
1056 * @ram_addr_abs: pointer into which to store the address of the dirty page
1057 * within the global ram_addr space
b9e60928 1058 */
ce25d337 1059static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss,
f3f491fc 1060 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1061{
6f37bb8b 1062 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1063 ram_addr_abs);
6f37bb8b
JQ
1064 if (pss->complete_round && pss->block == rs->last_seen_block &&
1065 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1066 /*
1067 * We've been once around the RAM and haven't found anything.
1068 * Give up.
1069 */
1070 *again = false;
1071 return false;
1072 }
1073 if (pss->offset >= pss->block->used_length) {
1074 /* Didn't find anything in this RAM Block */
1075 pss->offset = 0;
1076 pss->block = QLIST_NEXT_RCU(pss->block, next);
1077 if (!pss->block) {
1078 /* Hit the end of the list */
1079 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1080 /* Flag that we've looped */
1081 pss->complete_round = true;
6f37bb8b 1082 rs->ram_bulk_stage = false;
b9e60928
DDAG
1083 if (migrate_use_xbzrle()) {
1084 /* If xbzrle is on, stop using the data compression at this
1085 * point. In theory, xbzrle can do better than compression.
1086 */
ce25d337 1087 flush_compressed_data(rs);
b9e60928
DDAG
1088 }
1089 }
1090 /* Didn't find anything this time, but try again on the new block */
1091 *again = true;
1092 return false;
1093 } else {
1094 /* Can go around again, but... */
1095 *again = true;
1096 /* We've found something so probably don't need to */
1097 return true;
1098 }
1099}
1100
3d0684b2
JQ
1101/**
1102 * unqueue_page: gets a page of the queue
1103 *
a82d593b 1104 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1105 *
3d0684b2
JQ
1106 * Returns the block of the page (or NULL if none available)
1107 *
ec481c6c 1108 * @rs: current RAM state
3d0684b2
JQ
1109 * @offset: used to return the offset within the RAMBlock
1110 * @ram_addr_abs: pointer into which to store the address of the dirty page
1111 * within the global ram_addr space
a82d593b 1112 */
ec481c6c 1113static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset,
a82d593b
DDAG
1114 ram_addr_t *ram_addr_abs)
1115{
1116 RAMBlock *block = NULL;
1117
ec481c6c
JQ
1118 qemu_mutex_lock(&rs->src_page_req_mutex);
1119 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1120 struct RAMSrcPageRequest *entry =
1121 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1122 block = entry->rb;
1123 *offset = entry->offset;
1124 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1125 TARGET_PAGE_MASK;
1126
1127 if (entry->len > TARGET_PAGE_SIZE) {
1128 entry->len -= TARGET_PAGE_SIZE;
1129 entry->offset += TARGET_PAGE_SIZE;
1130 } else {
1131 memory_region_unref(block->mr);
ec481c6c 1132 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1133 g_free(entry);
1134 }
1135 }
ec481c6c 1136 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1137
1138 return block;
1139}
1140
3d0684b2
JQ
1141/**
 1142 * get_queued_page: unqueue a page from the postcopy requests
1143 *
1144 * Skips pages that are already sent (!dirty)
a82d593b 1145 *
3d0684b2 1146 * Returns whether a queued page was found
a82d593b 1147 *
6f37bb8b 1148 * @rs: current RAM state
3d0684b2
JQ
1149 * @pss: data about the state of the current dirty page scan
1150 * @ram_addr_abs: pointer into which to store the address of the dirty page
1151 * within the global ram_addr space
a82d593b 1152 */
ec481c6c 1153static bool get_queued_page(RAMState *rs, PageSearchStatus *pss,
a82d593b
DDAG
1154 ram_addr_t *ram_addr_abs)
1155{
1156 RAMBlock *block;
1157 ram_addr_t offset;
1158 bool dirty;
1159
1160 do {
ec481c6c 1161 block = unqueue_page(rs, &offset, ram_addr_abs);
a82d593b
DDAG
1162 /*
1163 * We're sending this page, and since it's postcopy nothing else
1164 * will dirty it, and we must make sure it doesn't get sent again
1165 * even if this queue request was received after the background
1166 * search already sent it.
1167 */
1168 if (block) {
1169 unsigned long *bitmap;
eb859c53 1170 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
a82d593b
DDAG
1171 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1172 if (!dirty) {
1173 trace_get_queued_page_not_dirty(
1174 block->idstr, (uint64_t)offset,
1175 (uint64_t)*ram_addr_abs,
1176 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
eb859c53 1177 atomic_rcu_read(&rs->ram_bitmap)->unsentmap));
a82d593b
DDAG
1178 } else {
1179 trace_get_queued_page(block->idstr,
1180 (uint64_t)offset,
1181 (uint64_t)*ram_addr_abs);
1182 }
1183 }
1184
1185 } while (block && !dirty);
1186
1187 if (block) {
1188 /*
1189 * As soon as we start servicing pages out of order, then we have
1190 * to kill the bulk stage, since the bulk stage assumes
1191 * in (migration_bitmap_find_and_reset_dirty) that every page is
1192 * dirty, that's no longer true.
1193 */
6f37bb8b 1194 rs->ram_bulk_stage = false;
a82d593b
DDAG
1195
1196 /*
1197 * We want the background search to continue from the queued page
1198 * since the guest is likely to want other pages near to the page
1199 * it just requested.
1200 */
1201 pss->block = block;
1202 pss->offset = offset;
1203 }
1204
1205 return !!block;
1206}
1207
6c595cde 1208/**
5e58f968
JQ
1209 * migration_page_queue_free: drop any remaining pages in the ram
1210 * request queue
6c595cde 1211 *
3d0684b2
JQ
1212 * It should be empty at the end anyway, but in error cases there may
 1213 * be some left; if any pages remain, we drop them.
1214 *
6c595cde 1215 */
ec481c6c 1216void migration_page_queue_free(void)
6c595cde 1217{
ec481c6c
JQ
1218 struct RAMSrcPageRequest *mspr, *next_mspr;
1219 RAMState *rs = &ram_state;
6c595cde
DDAG
1220 /* This queue generally should be empty - but in the case of a failed
 1221 * migration it might have some droppings left in it.
1222 */
1223 rcu_read_lock();
ec481c6c 1224 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1225 memory_region_unref(mspr->rb->mr);
ec481c6c 1226 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1227 g_free(mspr);
1228 }
1229 rcu_read_unlock();
1230}
1231
1232/**
3d0684b2
JQ
1233 * ram_save_queue_pages: queue the page for transmission
1234 *
1235 * A request from postcopy destination for example.
1236 *
1237 * Returns zero on success or negative on error
1238 *
3d0684b2
JQ
 1239 * @rbname: Name of the RAMBlock of the request. NULL means the
 1240 * same as the last one.
1241 * @start: starting address from the start of the RAMBlock
1242 * @len: length (in bytes) to send
6c595cde 1243 */
96506894 1244int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1245{
1246 RAMBlock *ramblock;
68a098f3 1247 RAMState *rs = &ram_state;
6c595cde 1248
96506894 1249 rs->postcopy_requests++;
6c595cde
DDAG
1250 rcu_read_lock();
1251 if (!rbname) {
1252 /* Reuse last RAMBlock */
68a098f3 1253 ramblock = rs->last_req_rb;
6c595cde
DDAG
1254
1255 if (!ramblock) {
1256 /*
1257 * Shouldn't happen, we can't reuse the last RAMBlock if
1258 * it's the 1st request.
1259 */
1260 error_report("ram_save_queue_pages no previous block");
1261 goto err;
1262 }
1263 } else {
1264 ramblock = qemu_ram_block_by_name(rbname);
1265
1266 if (!ramblock) {
1267 /* We shouldn't be asked for a non-existent RAMBlock */
1268 error_report("ram_save_queue_pages no block '%s'", rbname);
1269 goto err;
1270 }
68a098f3 1271 rs->last_req_rb = ramblock;
6c595cde
DDAG
1272 }
1273 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1274 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1275 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1276 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1277 __func__, start, len, ramblock->used_length);
1278 goto err;
1279 }
1280
ec481c6c
JQ
1281 struct RAMSrcPageRequest *new_entry =
1282 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1283 new_entry->rb = ramblock;
1284 new_entry->offset = start;
1285 new_entry->len = len;
1286
1287 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1288 qemu_mutex_lock(&rs->src_page_req_mutex);
1289 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1290 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1291 rcu_read_unlock();
1292
1293 return 0;
1294
1295err:
1296 rcu_read_unlock();
1297 return -1;
1298}
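/*
 * A minimal usage sketch, for the case described above of the source
 * reacting to a page request from the postcopy destination ("pc.ram" is
 * just an example block name and the offset is illustrative):
 *
 *     if (ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
 *         // the request could not be queued; the caller treats this as an error
 *     }
 */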
1299
a82d593b 1300/**
3d0684b2 1301 * ram_save_target_page: save one target page
a82d593b 1302 *
3d0684b2 1303 * Returns the number of pages written
a82d593b 1304 *
6f37bb8b 1305 * @rs: current RAM state
3d0684b2 1306 * @ms: current migration state
3d0684b2 1307 * @pss: data about the page we want to send
a82d593b 1308 * @last_stage: if we are at the completion stage
3d0684b2 1309 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1310 */
a0a8aa14
JQ
1311static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1312 bool last_stage, ram_addr_t dirty_ram_abs)
a82d593b
DDAG
1313{
1314 int res = 0;
1315
 1316 /* Check if the page is dirty and, if so, send it */
0d8ec885 1317 if (migration_bitmap_clear_dirty(rs, dirty_ram_abs)) {
a82d593b 1318 unsigned long *unsentmap;
6d358d94
JQ
1319 /*
1320 * If xbzrle is on, stop using the data compression after first
1321 * round of migration even if compression is enabled. In theory,
1322 * xbzrle can do better than compression.
1323 */
1324
1325 if (migrate_use_compression()
1326 && (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
a0a8aa14 1327 res = ram_save_compressed_page(rs, pss, last_stage);
a82d593b 1328 } else {
a0a8aa14 1329 res = ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1330 }
1331
1332 if (res < 0) {
1333 return res;
1334 }
eb859c53 1335 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
a82d593b
DDAG
1336 if (unsentmap) {
1337 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1338 }
3fd3c4b3
DDAG
1339 /* Only update last_sent_block if a block was actually sent; xbzrle
1340 * might have decided the page was identical so didn't bother writing
1341 * to the stream.
1342 */
1343 if (res > 0) {
6f37bb8b 1344 rs->last_sent_block = pss->block;
3fd3c4b3 1345 }
a82d593b
DDAG
1346 }
1347
1348 return res;
1349}
1350
1351/**
3d0684b2 1352 * ram_save_host_page: save a whole host page
a82d593b 1353 *
3d0684b2
JQ
1354 * Starting at *offset send pages up to the end of the current host
1355 * page. It's valid for the initial offset to point into the middle of
1356 * a host page in which case the remainder of the hostpage is sent.
1357 * Only dirty target pages are sent. Note that the host page size may
1358 * be a huge page for this block.
a82d593b 1359 *
3d0684b2
JQ
1360 * Returns the number of pages written or negative on error
1361 *
6f37bb8b 1362 * @rs: current RAM state
3d0684b2 1363 * @ms: current migration state
3d0684b2 1364 * @pss: data about the page we want to send
a82d593b 1365 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1366 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1367 */
a0a8aa14 1368static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
a08f6890 1369 bool last_stage,
a82d593b
DDAG
1370 ram_addr_t dirty_ram_abs)
1371{
1372 int tmppages, pages = 0;
4c011c37
DDAG
1373 size_t pagesize = qemu_ram_pagesize(pss->block);
1374
a82d593b 1375 do {
a0a8aa14 1376 tmppages = ram_save_target_page(rs, pss, last_stage, dirty_ram_abs);
a82d593b
DDAG
1377 if (tmppages < 0) {
1378 return tmppages;
1379 }
1380
1381 pages += tmppages;
a08f6890 1382 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1383 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1384 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1385
1386 /* The offset we leave with is the last one we looked at */
a08f6890 1387 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1388 return pages;
1389}
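/*
 * Worked example (assuming 4KiB target pages): for a RAMBlock backed by 2MiB
 * huge pages, qemu_ram_pagesize() returns 0x200000, so the loop above covers
 * 512 target pages before pss->offset crosses the host page boundary; for
 * ordinary blocks pagesize == TARGET_PAGE_SIZE and exactly one target page
 * is processed.
 */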
6c595cde 1390
56e93d26 1391/**
3d0684b2 1392 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1393 *
1394 * Called within an RCU critical section.
1395 *
3d0684b2 1396 * Returns the number of pages written where zero means no dirty pages
56e93d26 1397 *
6f37bb8b 1398 * @rs: current RAM state
56e93d26 1399 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1400 *
1401 * On systems where host-page-size > target-page-size it will send all the
1402 * pages in a host page that are dirty.
56e93d26
JQ
1403 */
1404
ce25d337 1405static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1406{
b8fb8cb7 1407 PageSearchStatus pss;
56e93d26 1408 int pages = 0;
b9e60928 1409 bool again, found;
f3f491fc
DDAG
1410 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1411 ram_addr_t space */
56e93d26 1412
0827b9e9
AA
1413 /* No dirty page as there is zero RAM */
1414 if (!ram_bytes_total()) {
1415 return pages;
1416 }
1417
6f37bb8b
JQ
1418 pss.block = rs->last_seen_block;
1419 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1420 pss.complete_round = false;
1421
1422 if (!pss.block) {
1423 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1424 }
56e93d26 1425
b9e60928 1426 do {
a82d593b 1427 again = true;
ec481c6c 1428 found = get_queued_page(rs, &pss, &dirty_ram_abs);
b9e60928 1429
a82d593b
DDAG
1430 if (!found) {
1431 /* priority queue empty, so just search for something dirty */
ce25d337 1432 found = find_dirty_block(rs, &pss, &again, &dirty_ram_abs);
a82d593b 1433 }
f3f491fc 1434
a82d593b 1435 if (found) {
a0a8aa14 1436 pages = ram_save_host_page(rs, &pss, last_stage, dirty_ram_abs);
56e93d26 1437 }
b9e60928 1438 } while (!pages && again);
56e93d26 1439
6f37bb8b
JQ
1440 rs->last_seen_block = pss.block;
1441 rs->last_offset = pss.offset;
56e93d26
JQ
1442
1443 return pages;
1444}
1445
1446void acct_update_position(QEMUFile *f, size_t size, bool zero)
1447{
1448 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b
JQ
1449 RAMState *rs = &ram_state;
1450
56e93d26 1451 if (zero) {
f7ccd61b 1452 rs->zero_pages += pages;
56e93d26 1453 } else {
b4d1c6e7 1454 rs->norm_pages += pages;
2f4fde93 1455 rs->bytes_transferred += size;
56e93d26
JQ
1456 qemu_update_position(f, size);
1457 }
1458}
1459
56e93d26
JQ
1460uint64_t ram_bytes_total(void)
1461{
1462 RAMBlock *block;
1463 uint64_t total = 0;
1464
1465 rcu_read_lock();
1466 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1467 total += block->used_length;
1468 rcu_read_unlock();
1469 return total;
1470}
1471
1472void free_xbzrle_decoded_buf(void)
1473{
1474 g_free(xbzrle_decoded_buf);
1475 xbzrle_decoded_buf = NULL;
1476}
1477
eb859c53 1478static void migration_bitmap_free(struct RAMBitmap *bmap)
60be6340
DL
1479{
1480 g_free(bmap->bmap);
f3f491fc 1481 g_free(bmap->unsentmap);
60be6340
DL
1482 g_free(bmap);
1483}
1484
6ad2a215 1485static void ram_migration_cleanup(void *opaque)
56e93d26 1486{
eb859c53
JQ
1487 RAMState *rs = opaque;
1488
2ff64038
LZ
 1489 /* the caller must hold the iothread lock or be in a bh, so there is
1490 * no writing race against this migration_bitmap
1491 */
eb859c53
JQ
1492 struct RAMBitmap *bitmap = rs->ram_bitmap;
1493 atomic_rcu_set(&rs->ram_bitmap, NULL);
2ff64038 1494 if (bitmap) {
56e93d26 1495 memory_global_dirty_log_stop();
60be6340 1496 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1497 }
1498
1499 XBZRLE_cache_lock();
1500 if (XBZRLE.cache) {
1501 cache_fini(XBZRLE.cache);
1502 g_free(XBZRLE.encoded_buf);
1503 g_free(XBZRLE.current_buf);
adb65dec 1504 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1505 XBZRLE.cache = NULL;
1506 XBZRLE.encoded_buf = NULL;
1507 XBZRLE.current_buf = NULL;
1508 }
1509 XBZRLE_cache_unlock();
1510}
1511
6f37bb8b 1512static void ram_state_reset(RAMState *rs)
56e93d26 1513{
6f37bb8b
JQ
1514 rs->last_seen_block = NULL;
1515 rs->last_sent_block = NULL;
1516 rs->last_offset = 0;
1517 rs->last_version = ram_list.version;
1518 rs->ram_bulk_stage = true;
56e93d26
JQ
1519}
1520
1521#define MAX_WAIT 50 /* ms, half buffered_file limit */
1522
dd631697
LZ
1523void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1524{
0d8ec885 1525 RAMState *rs = &ram_state;
108cfae0 1526
dd631697
LZ
1527 /* called in qemu main thread, so there is
1528 * no writing race against this migration_bitmap
1529 */
eb859c53
JQ
1530 if (rs->ram_bitmap) {
1531 struct RAMBitmap *old_bitmap = rs->ram_bitmap, *bitmap;
1532 bitmap = g_new(struct RAMBitmap, 1);
60be6340 1533 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1534
 1535 /* prevent bits in the migration_bitmap from being set
 1536 * by migration_bitmap_sync_range() at the same time.
 1537 * It is safe for migration if bits in the migration_bitmap are
 1538 * cleared at the same time.
1539 */
108cfae0 1540 qemu_mutex_lock(&rs->bitmap_mutex);
60be6340
DL
1541 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1542 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1543
 1544 /* We don't have a way to safely extend the unsentmap
 1545 * with RCU; so mark it as missing, and entry to postcopy
1546 * will fail.
1547 */
1548 bitmap->unsentmap = NULL;
1549
eb859c53 1550 atomic_rcu_set(&rs->ram_bitmap, bitmap);
108cfae0 1551 qemu_mutex_unlock(&rs->bitmap_mutex);
0d8ec885 1552 rs->migration_dirty_pages += new - old;
60be6340 1553 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1554 }
1555}
56e93d26 1556
4f2e4252
DDAG
1557/*
1558 * 'expected' is the value you expect the bitmap mostly to be full
1559 * of; it won't bother printing lines that are all this value.
1560 * If 'todump' is null the migration bitmap is dumped.
1561 */
1562void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1563{
1564 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53 1565 RAMState *rs = &ram_state;
4f2e4252
DDAG
1566 int64_t cur;
1567 int64_t linelen = 128;
1568 char linebuf[129];
1569
1570 if (!todump) {
eb859c53 1571 todump = atomic_rcu_read(&rs->ram_bitmap)->bmap;
4f2e4252
DDAG
1572 }
1573
1574 for (cur = 0; cur < ram_pages; cur += linelen) {
1575 int64_t curb;
1576 bool found = false;
1577 /*
1578 * Last line; catch the case where the line length
1579 * is longer than remaining ram
1580 */
1581 if (cur + linelen > ram_pages) {
1582 linelen = ram_pages - cur;
1583 }
1584 for (curb = 0; curb < linelen; curb++) {
1585 bool thisbit = test_bit(cur + curb, todump);
1586 linebuf[curb] = thisbit ? '1' : '.';
1587 found = found || (thisbit != expected);
1588 }
1589 if (found) {
1590 linebuf[curb] = '\0';
1591 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1592 }
1593 }
1594}
1595
e0b266f0
DDAG
1596/* **** functions for postcopy ***** */
1597
ced1c616
PB
1598void ram_postcopy_migrated_memory_release(MigrationState *ms)
1599{
eb859c53 1600 RAMState *rs = &ram_state;
ced1c616 1601 struct RAMBlock *block;
eb859c53 1602 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
ced1c616
PB
1603
1604 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1605 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1606 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1607 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1608
1609 while (run_start < range) {
1610 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1611 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1612 (run_end - run_start) << TARGET_PAGE_BITS);
1613 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1614 }
1615 }
1616}
1617
3d0684b2
JQ
1618/**
1619 * postcopy_send_discard_bm_ram: discard a RAMBlock
1620 *
1621 * Returns zero on success
1622 *
e0b266f0
DDAG
1623 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1624 * Note: At this point the 'unsentmap' is the processed bitmap combined
1625 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1626 *
1627 * @ms: current migration state
1628 * @pds: state for postcopy
1629 * @start: RAMBlock starting page
1630 * @length: RAMBlock size
e0b266f0
DDAG
1631 */
1632static int postcopy_send_discard_bm_ram(MigrationState *ms,
1633 PostcopyDiscardState *pds,
1634 unsigned long start,
1635 unsigned long length)
1636{
eb859c53 1637 RAMState *rs = &ram_state;
e0b266f0
DDAG
1638 unsigned long end = start + length; /* one after the end */
1639 unsigned long current;
1640 unsigned long *unsentmap;
1641
eb859c53 1642 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1643 for (current = start; current < end; ) {
1644 unsigned long one = find_next_bit(unsentmap, end, current);
1645
1646 if (one <= end) {
1647 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1648 unsigned long discard_length;
1649
1650 if (zero >= end) {
1651 discard_length = end - one;
1652 } else {
1653 discard_length = zero - one;
1654 }
d688c62d
DDAG
1655 if (discard_length) {
1656 postcopy_discard_send_range(ms, pds, one, discard_length);
1657 }
e0b266f0
DDAG
1658 current = one + discard_length;
1659 } else {
1660 current = one;
1661 }
1662 }
1663
1664 return 0;
1665}
1666
3d0684b2
JQ
1667/**
1668 * postcopy_each_ram_send_discard: discard all RAMBlocks
1669 *
1670 * Returns 0 for success or negative for error
1671 *
e0b266f0
DDAG
1672 * Utility for the outgoing postcopy code.
1673 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1674 * passing it bitmap indexes and name.
e0b266f0
DDAG
1675 * (qemu_ram_foreach_block ends up passing unscaled lengths
1676 * which would mean the postcopy code would have to deal with target page sizes)
3d0684b2
JQ
1677 *
1678 * @ms: current migration state
e0b266f0
DDAG
1679 */
1680static int postcopy_each_ram_send_discard(MigrationState *ms)
1681{
1682 struct RAMBlock *block;
1683 int ret;
1684
1685 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1686 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1687 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1688 first,
1689 block->idstr);
1690
1691 /*
1692 * Postcopy sends chunks of bitmap over the wire, but it
1693 * just needs indexes at this point; this avoids it having
1694 * target-page-specific code.
1695 */
1696 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1697 block->used_length >> TARGET_PAGE_BITS);
1698 postcopy_discard_send_finish(ms, pds);
1699 if (ret) {
1700 return ret;
1701 }
1702 }
1703
1704 return 0;
1705}
1706
3d0684b2
JQ
1707/**
1708 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1709 *
1710 * Helper for postcopy_chunk_hostpages; it's called twice to
1711 * canonicalize the two bitmaps, that are similar, but one is
1712 * inverted.
99e314eb 1713 *
3d0684b2
JQ
1714 * Postcopy requires that all target pages in a host page are dirty or
1715 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1716 *
3d0684b2
JQ
1717 * @ms: current migration state
1718 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1719 * otherwise we need to canonicalize partially dirty host pages
1720 * @block: block that contains the page we want to canonicalize
1721 * @pds: state for postcopy
99e314eb
DDAG
1722 */
1723static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1724 RAMBlock *block,
1725 PostcopyDiscardState *pds)
1726{
0d8ec885 1727 RAMState *rs = &ram_state;
99e314eb
DDAG
1728 unsigned long *bitmap;
1729 unsigned long *unsentmap;
29c59172 1730 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1731 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1732 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1733 unsigned long last = first + (len - 1);
1734 unsigned long run_start;
1735
29c59172
DDAG
1736 if (block->page_size == TARGET_PAGE_SIZE) {
1737 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1738 return;
1739 }
1740
eb859c53
JQ
1741 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
1742 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
99e314eb
DDAG
1743
1744 if (unsent_pass) {
1745 /* Find a sent page */
1746 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1747 } else {
1748 /* Find a dirty page */
1749 run_start = find_next_bit(bitmap, last + 1, first);
1750 }
1751
1752 while (run_start <= last) {
1753 bool do_fixup = false;
1754 unsigned long fixup_start_addr;
1755 unsigned long host_offset;
1756
1757 /*
1758 * If the start of this run of pages is in the middle of a host
1759 * page, then we need to fixup this host page.
1760 */
1761 host_offset = run_start % host_ratio;
1762 if (host_offset) {
1763 do_fixup = true;
1764 run_start -= host_offset;
1765 fixup_start_addr = run_start;
1766 /* For the next pass */
1767 run_start = run_start + host_ratio;
1768 } else {
1769 /* Find the end of this run */
1770 unsigned long run_end;
1771 if (unsent_pass) {
1772 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1773 } else {
1774 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1775 }
1776 /*
1777 * If the end isn't at the start of a host page, then the
1778 * run doesn't finish at the end of a host page
1779 * and we need to discard.
1780 */
1781 host_offset = run_end % host_ratio;
1782 if (host_offset) {
1783 do_fixup = true;
1784 fixup_start_addr = run_end - host_offset;
1785 /*
1786 * This host page has gone, the next loop iteration starts
1787 * from after the fixup
1788 */
1789 run_start = fixup_start_addr + host_ratio;
1790 } else {
1791 /*
1792 * No discards on this iteration, next loop starts from
1793 * next sent/dirty page
1794 */
1795 run_start = run_end + 1;
1796 }
1797 }
1798
1799 if (do_fixup) {
1800 unsigned long page;
1801
1802 /* Tell the destination to discard this page */
1803 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1804 /* For the unsent_pass we:
1805 * discard partially sent pages
1806 * For the !unsent_pass (dirty) we:
1807 * discard partially dirty pages that were sent
1808 * (any partially sent pages were already discarded
1809 * by the previous unsent_pass)
1810 */
1811 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1812 host_ratio);
1813 }
1814
1815 /* Clean up the bitmap */
1816 for (page = fixup_start_addr;
1817 page < fixup_start_addr + host_ratio; page++) {
1818 /* All pages in this host page are now not sent */
1819 set_bit(page, unsentmap);
1820
1821 /*
1822 * Remark them as dirty, updating the count for any pages
1823 * that weren't previously dirty.
1824 */
0d8ec885 1825 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1826 }
1827 }
1828
1829 if (unsent_pass) {
1830 /* Find the next sent page for the next iteration */
1831 run_start = find_next_zero_bit(unsentmap, last + 1,
1832 run_start);
1833 } else {
1834 /* Find the next dirty page for the next iteration */
1835 run_start = find_next_bit(bitmap, last + 1, run_start);
1836 }
1837 }
1838}
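
/*
 * Illustrative sketch (not part of ram.c): the host_ratio rounding that
 * postcopy_chunk_hostpages_pass performs, shown as standalone arithmetic.
 * The page sizes below are example values (2MB hugepage, 4KB target page),
 * not anything read from a RAMBlock.
 */
#include <stdio.h>

int main(void)
{
    unsigned long host_page_size = 2 * 1024 * 1024;  /* assumed hugepage size */
    unsigned long target_page_size = 4 * 1024;
    unsigned long host_ratio = host_page_size / target_page_size; /* 512 */

    /* a run of dirty/unsent target pages starting mid-hostpage */
    unsigned long run_start = 1000;
    unsigned long host_offset = run_start % host_ratio;            /* 488 */

    if (host_offset) {
        /* round down to the host-page boundary that must be discarded whole */
        unsigned long fixup_start_addr = run_start - host_offset;  /* 512 */

        printf("discard host page covering target pages %lu..%lu\n",
               fixup_start_addr, fixup_start_addr + host_ratio - 1);
    }
    return 0;
}
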
1839
3d0684b2
JQ
1840/**
1841 * postcopy_chunk_hostpages: discard any partially sent host page
1842 *
99e314eb
DDAG
1843 * Utility for the outgoing postcopy code.
1844 *
1845 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1846 * dirty host-page size chunks as all dirty. In this case the host-page
1847 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1848 *
3d0684b2
JQ
1849 * Returns zero on success
1850 *
1851 * @ms: current migration state
99e314eb
DDAG
1852 */
1853static int postcopy_chunk_hostpages(MigrationState *ms)
1854{
6f37bb8b 1855 RAMState *rs = &ram_state;
99e314eb
DDAG
1856 struct RAMBlock *block;
1857
99e314eb 1858 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1859 rs->last_seen_block = NULL;
1860 rs->last_sent_block = NULL;
1861 rs->last_offset = 0;
99e314eb
DDAG
1862
1863 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1864 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1865
1866 PostcopyDiscardState *pds =
1867 postcopy_discard_send_init(ms, first, block->idstr);
1868
1869 /* First pass: Discard all partially sent host pages */
1870 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1871 /*
1872 * Second pass: Ensure that all partially dirty host pages are made
1873 * fully dirty.
1874 */
1875 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1876
1877 postcopy_discard_send_finish(ms, pds);
1878 } /* ram_list loop */
1879
1880 return 0;
1881}
1882
3d0684b2
JQ
1883/**
1884 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1885 *
1886 * Returns zero on success
1887 *
e0b266f0
DDAG
1888 * Transmit the set of pages to be discarded after precopy to the target;
1889 * these are pages that:
1890 * a) have previously been transmitted but are now dirty again
1891 * b) have never been transmitted; this ensures that
1892 * any pages on the destination that have been mapped by background
1893 * tasks get discarded (transparent huge pages are the specific concern)
1894 * Hopefully this set is pretty sparse
3d0684b2
JQ
1895 *
1896 * @ms: current migration state
e0b266f0
DDAG
1897 */
1898int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1899{
eb859c53 1900 RAMState *rs = &ram_state;
e0b266f0
DDAG
1901 int ret;
1902 unsigned long *bitmap, *unsentmap;
1903
1904 rcu_read_lock();
1905
1906 /* This should be our last sync, the src is now paused */
eb859c53 1907 migration_bitmap_sync(rs);
e0b266f0 1908
eb859c53 1909 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1910 if (!unsentmap) {
1911 /* We don't have a safe way to resize the unsentmap, so
1912 * if the bitmap was resized it will be NULL at this
1913 * point.
1914 */
1915 error_report("migration ram resized during precopy phase");
1916 rcu_read_unlock();
1917 return -EINVAL;
1918 }
1919
29c59172 1920 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1921 ret = postcopy_chunk_hostpages(ms);
1922 if (ret) {
1923 rcu_read_unlock();
1924 return ret;
1925 }
1926
e0b266f0
DDAG
1927 /*
1928 * Update the unsentmap to be unsentmap = unsentmap | dirty
1929 */
eb859c53 1930 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
e0b266f0
DDAG
1931 bitmap_or(unsentmap, unsentmap, bitmap,
1932 last_ram_offset() >> TARGET_PAGE_BITS);
1933
1934
1935 trace_ram_postcopy_send_discard_bitmap();
1936#ifdef DEBUG_POSTCOPY
1937 ram_debug_dump_bitmap(unsentmap, true);
1938#endif
1939
1940 ret = postcopy_each_ram_send_discard(ms);
1941 rcu_read_unlock();
1942
1943 return ret;
1944}
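
/*
 * Illustrative sketch (not part of ram.c): the "unsentmap |= dirty" merge
 * that ram_postcopy_send_discard_bitmap performs with bitmap_or(), written
 * out for a tiny word-based bitmap so the semantics are explicit.  The bit
 * patterns are invented for the example.
 */
#include <stdio.h>

int main(void)
{
    unsigned long unsentmap[2] = { 0x00f0UL, 0x1UL }; /* never-sent pages */
    unsigned long dirty[2]     = { 0x0f00UL, 0x8UL }; /* re-dirtied pages  */

    for (int i = 0; i < 2; i++) {
        unsentmap[i] |= dirty[i];   /* what bitmap_or() does, word by word */
    }

    printf("merged: %#lx %#lx\n", unsentmap[0], unsentmap[1]);
    return 0;
}
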
1945
3d0684b2
JQ
1946/**
1947 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1948 *
3d0684b2 1949 * Returns zero on success
e0b266f0 1950 *
3d0684b2 1951 * @mis: current migration incoming state
36449157
JQ
1952 * @rbname: name of the RAMBlock of the request. NULL means the
1953 * same as the last one.
3d0684b2
JQ
1954 * @start: RAMBlock starting page
1955 * @length: RAMBlock size
e0b266f0
DDAG
1956 */
1957int ram_discard_range(MigrationIncomingState *mis,
36449157 1958 const char *rbname,
e0b266f0
DDAG
1959 uint64_t start, size_t length)
1960{
1961 int ret = -1;
1962
36449157 1963 trace_ram_discard_range(rbname, start, length);
d3a5038c 1964
e0b266f0 1965 rcu_read_lock();
36449157 1966 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1967
1968 if (!rb) {
36449157 1969 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1970 goto err;
1971 }
1972
d3a5038c 1973 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1974
1975err:
1976 rcu_read_unlock();
1977
1978 return ret;
1979}
1980
ceb4d168 1981static int ram_state_init(RAMState *rs)
56e93d26 1982{
56e93d26
JQ
1983 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1984
ceb4d168 1985 memset(rs, 0, sizeof(*rs));
108cfae0 1986 qemu_mutex_init(&rs->bitmap_mutex);
ec481c6c
JQ
1987 qemu_mutex_init(&rs->src_page_req_mutex);
1988 QSIMPLEQ_INIT(&rs->src_page_requests);
56e93d26
JQ
1989
1990 if (migrate_use_xbzrle()) {
1991 XBZRLE_cache_lock();
adb65dec 1992 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
1993 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1994 TARGET_PAGE_SIZE,
1995 TARGET_PAGE_SIZE);
1996 if (!XBZRLE.cache) {
1997 XBZRLE_cache_unlock();
1998 error_report("Error creating cache");
1999 return -1;
2000 }
2001 XBZRLE_cache_unlock();
2002
2003 /* We prefer not to abort if there is no memory */
2004 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2005 if (!XBZRLE.encoded_buf) {
2006 error_report("Error allocating encoded_buf");
2007 return -1;
2008 }
2009
2010 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2011 if (!XBZRLE.current_buf) {
2012 error_report("Error allocating current_buf");
2013 g_free(XBZRLE.encoded_buf);
2014 XBZRLE.encoded_buf = NULL;
2015 return -1;
2016 }
56e93d26
JQ
2017 }
2018
49877834
PB
2019 /* For memory_global_dirty_log_start below. */
2020 qemu_mutex_lock_iothread();
2021
56e93d26
JQ
2022 qemu_mutex_lock_ramlist();
2023 rcu_read_lock();
6f37bb8b 2024 ram_state_reset(rs);
56e93d26 2025
eb859c53 2026 rs->ram_bitmap = g_new0(struct RAMBitmap, 1);
0827b9e9
AA
2027 /* Skip setting bitmap if there is no RAM */
2028 if (ram_bytes_total()) {
2029 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53
JQ
2030 rs->ram_bitmap->bmap = bitmap_new(ram_bitmap_pages);
2031 bitmap_set(rs->ram_bitmap->bmap, 0, ram_bitmap_pages);
0827b9e9
AA
2032
2033 if (migrate_postcopy_ram()) {
eb859c53
JQ
2034 rs->ram_bitmap->unsentmap = bitmap_new(ram_bitmap_pages);
2035 bitmap_set(rs->ram_bitmap->unsentmap, 0, ram_bitmap_pages);
0827b9e9 2036 }
f3f491fc
DDAG
2037 }
2038
56e93d26
JQ
2039 /*
2040 * Count the total number of pages used by ram blocks not including any
2041 * gaps due to alignment or unplugs.
2042 */
0d8ec885 2043 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
2044
2045 memory_global_dirty_log_start();
8d820d6f 2046 migration_bitmap_sync(rs);
56e93d26 2047 qemu_mutex_unlock_ramlist();
49877834 2048 qemu_mutex_unlock_iothread();
a91246c9
HZ
2049 rcu_read_unlock();
2050
2051 return 0;
2052}
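
/*
 * Illustrative sketch (not part of ram.c): how big the dirty bitmap that
 * ram_state_init allocates actually is, using example numbers - one bit per
 * target page, rounded up to whole unsigned longs, with every bit initially
 * set ("everything is dirty at the start of migration").  The RAM size and
 * page shift are assumptions for the example; the memset also sets the
 * padding bits, which bitmap_set() would not.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    unsigned long long ram_bytes = 4ULL * 1024 * 1024 * 1024; /* example: 4G */
    unsigned int target_page_bits = 12;                       /* 4K pages */
    unsigned long long pages = ram_bytes >> target_page_bits;
    size_t bits_per_long = sizeof(unsigned long) * CHAR_BIT;
    size_t nlongs = (pages + bits_per_long - 1) / bits_per_long;

    unsigned long *bmap = calloc(nlongs, sizeof(*bmap));
    if (!bmap) {
        return 1;
    }
    memset(bmap, 0xff, nlongs * sizeof(*bmap));  /* ~ bitmap_set(bmap, 0, pages) */

    printf("%llu pages -> %zu unsigned longs (%zu bytes) of bitmap\n",
           pages, nlongs, nlongs * sizeof(*bmap));
    free(bmap);
    return 0;
}
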
2053
3d0684b2
JQ
2054/*
2055 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
a91246c9
HZ
2056 * long-running RCU critical section. When RCU reclaims in the code
2057 * become numerous, it will be necessary to reduce the
2058 * granularity of these critical sections.
2059 */
2060
3d0684b2
JQ
2061/**
2062 * ram_save_setup: Setup RAM for migration
2063 *
2064 * Returns zero to indicate success and negative for error
2065 *
2066 * @f: QEMUFile where to send the data
2067 * @opaque: RAMState pointer
2068 */
a91246c9
HZ
2069static int ram_save_setup(QEMUFile *f, void *opaque)
2070{
6f37bb8b 2071 RAMState *rs = opaque;
a91246c9
HZ
2072 RAMBlock *block;
2073
2074 /* migration has already setup the bitmap, reuse it. */
2075 if (!migration_in_colo_state()) {
ceb4d168 2076 if (ram_state_init(rs) < 0) {
a91246c9
HZ
2077 return -1;
2078 }
2079 }
204b88b8 2080 rs->f = f;
a91246c9
HZ
2081
2082 rcu_read_lock();
56e93d26
JQ
2083
2084 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2085
2086 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2087 qemu_put_byte(f, strlen(block->idstr));
2088 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2089 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2090 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2091 qemu_put_be64(f, block->page_size);
2092 }
56e93d26
JQ
2093 }
2094
2095 rcu_read_unlock();
2096
2097 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2098 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2099
2100 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2101
2102 return 0;
2103}
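
/*
 * Illustrative sketch (not part of ram.c): the byte stream that the setup
 * stage above produces, reproduced with a toy big-endian writer.  The block
 * name, the sizes and the two flag values are invented for the example; the
 * flags merely stand in for RAM_SAVE_FLAG_MEM_SIZE and RAM_SAVE_FLAG_EOS.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_be64(FILE *f, uint64_t v)
{
    for (int shift = 56; shift >= 0; shift -= 8) {
        fputc((v >> shift) & 0xff, f);
    }
}

static void toy_save_setup(FILE *f)
{
    const char *idstr = "pc.ram";              /* example block name */
    uint64_t total_bytes = 512 * 1024 * 1024;  /* example guest RAM size */
    uint64_t used_length = total_bytes;

    put_be64(f, total_bytes | 0x04 /* toy MEM_SIZE flag */);
    fputc((int)strlen(idstr), f);              /* id length byte */
    fwrite(idstr, 1, strlen(idstr), f);        /* id bytes, no NUL */
    put_be64(f, used_length);                  /* block used_length */
    /* a be64 page_size would follow here for postcopy with huge pages */
    put_be64(f, 0x10 /* toy EOS flag */);
}

int main(void)
{
    toy_save_setup(stdout);
    return 0;
}
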
2104
3d0684b2
JQ
2105/**
2106 * ram_save_iterate: iterative stage for migration
2107 *
2108 * Returns zero to indicate success and negative for error
2109 *
2110 * @f: QEMUFile where to send the data
2111 * @opaque: RAMState pointer
2112 */
56e93d26
JQ
2113static int ram_save_iterate(QEMUFile *f, void *opaque)
2114{
6f37bb8b 2115 RAMState *rs = opaque;
56e93d26
JQ
2116 int ret;
2117 int i;
2118 int64_t t0;
5c90308f 2119 int done = 0;
56e93d26
JQ
2120
2121 rcu_read_lock();
6f37bb8b
JQ
2122 if (ram_list.version != rs->last_version) {
2123 ram_state_reset(rs);
56e93d26
JQ
2124 }
2125
2126 /* Read version before ram_list.blocks */
2127 smp_rmb();
2128
2129 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2130
2131 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2132 i = 0;
2133 while ((ret = qemu_file_rate_limit(f)) == 0) {
2134 int pages;
2135
ce25d337 2136 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
2137 /* no more pages to send */
2138 if (pages == 0) {
5c90308f 2139 done = 1;
56e93d26
JQ
2140 break;
2141 }
23b28c3c 2142 rs->iterations++;
070afca2 2143
56e93d26
JQ
2144 /* we want to check in the 1st loop, just in case it was the 1st time
2145 and we had to sync the dirty bitmap.
2146 qemu_clock_get_ns() is a bit expensive, so we only check once every
2147 few iterations
2148 */
2149 if ((i & 63) == 0) {
2150 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2151 if (t1 > MAX_WAIT) {
55c4446b 2152 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2153 break;
2154 }
2155 }
2156 i++;
2157 }
ce25d337 2158 flush_compressed_data(rs);
56e93d26
JQ
2159 rcu_read_unlock();
2160
2161 /*
2162 * Must occur before EOS (or any QEMUFile operation)
2163 * because of RDMA protocol.
2164 */
2165 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2166
2167 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2f4fde93 2168 rs->bytes_transferred += 8;
56e93d26
JQ
2169
2170 ret = qemu_file_get_error(f);
2171 if (ret < 0) {
2172 return ret;
2173 }
2174
5c90308f 2175 return done;
56e93d26
JQ
2176}
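
/*
 * Illustrative sketch (not part of ram.c): the "(i & 63) == 0" pattern used
 * in ram_save_iterate above - only consulting the (relatively expensive)
 * clock once every 64 loop iterations while still bounding how long one
 * iteration call runs.  The time budget and the empty work loop are invented
 * for the example.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
    const int64_t max_wait_ms = 50;   /* example budget per iteration call */
    int64_t t0 = now_ns();
    int i = 0;

    for (;;) {
        /* ... send one page's worth of data here ... */

        if ((i & 63) == 0) {          /* check the clock only occasionally */
            int64_t elapsed_ms = (now_ns() - t0) / 1000000;
            if (elapsed_ms > max_wait_ms) {
                printf("bailing out after %d iterations, %lld ms\n",
                       i, (long long)elapsed_ms);
                break;
            }
        }
        i++;
    }
    return 0;
}
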
2177
3d0684b2
JQ
2178/**
2179 * ram_save_complete: function called to send the remaining amount of ram
2180 *
2181 * Returns zero to indicate success
2182 *
2183 * Called with iothread lock
2184 *
2185 * @f: QEMUFile where to send the data
2186 * @opaque: RAMState pointer
2187 */
56e93d26
JQ
2188static int ram_save_complete(QEMUFile *f, void *opaque)
2189{
6f37bb8b
JQ
2190 RAMState *rs = opaque;
2191
56e93d26
JQ
2192 rcu_read_lock();
2193
5727309d 2194 if (!migration_in_postcopy()) {
8d820d6f 2195 migration_bitmap_sync(rs);
663e6c1d 2196 }
56e93d26
JQ
2197
2198 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2199
2200 /* try transferring iterative blocks of memory */
2201
2202 /* flush all remaining blocks regardless of rate limiting */
2203 while (true) {
2204 int pages;
2205
ce25d337 2206 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
2207 /* no more blocks to send */
2208 if (pages == 0) {
2209 break;
2210 }
2211 }
2212
ce25d337 2213 flush_compressed_data(rs);
56e93d26 2214 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2215
2216 rcu_read_unlock();
d09a6fde 2217
56e93d26
JQ
2218 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2219
2220 return 0;
2221}
2222
c31b098f
DDAG
2223static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2224 uint64_t *non_postcopiable_pending,
2225 uint64_t *postcopiable_pending)
56e93d26 2226{
8d820d6f 2227 RAMState *rs = opaque;
56e93d26
JQ
2228 uint64_t remaining_size;
2229
9edabd4d 2230 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2231
5727309d 2232 if (!migration_in_postcopy() &&
663e6c1d 2233 remaining_size < max_size) {
56e93d26
JQ
2234 qemu_mutex_lock_iothread();
2235 rcu_read_lock();
8d820d6f 2236 migration_bitmap_sync(rs);
56e93d26
JQ
2237 rcu_read_unlock();
2238 qemu_mutex_unlock_iothread();
9edabd4d 2239 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2240 }
c31b098f
DDAG
2241
2242 /* We can do postcopy, and all the data is postcopiable */
2243 *postcopiable_pending += remaining_size;
56e93d26
JQ
2244}
2245
2246static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2247{
2248 unsigned int xh_len;
2249 int xh_flags;
063e760a 2250 uint8_t *loaded_data;
56e93d26
JQ
2251
2252 if (!xbzrle_decoded_buf) {
2253 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2254 }
063e760a 2255 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2256
2257 /* extract RLE header */
2258 xh_flags = qemu_get_byte(f);
2259 xh_len = qemu_get_be16(f);
2260
2261 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2262 error_report("Failed to load XBZRLE page - wrong compression!");
2263 return -1;
2264 }
2265
2266 if (xh_len > TARGET_PAGE_SIZE) {
2267 error_report("Failed to load XBZRLE page - len overflow!");
2268 return -1;
2269 }
2270 /* load data and decode */
063e760a 2271 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2272
2273 /* decode RLE */
063e760a 2274 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2275 TARGET_PAGE_SIZE) == -1) {
2276 error_report("Failed to load XBZRLE page - decode error!");
2277 return -1;
2278 }
2279
2280 return 0;
2281}
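
/*
 * Illustrative sketch (not part of ram.c): the XBZRLE page header that
 * load_xbzrle reads above - a one-byte encoding flag followed by a
 * big-endian 16-bit payload length - parsed here from an in-memory buffer.
 * The flag value, page size and stream contents are assumptions made for
 * the example.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_ENCODING_FLAG_XBZRLE 0x1   /* assumed flag value */
#define TOY_PAGE_SIZE            4096

static int toy_parse_xbzrle_header(const uint8_t *buf, size_t buflen,
                                   unsigned int *xh_len)
{
    if (buflen < 3) {
        return -1;
    }
    if (buf[0] != TOY_ENCODING_FLAG_XBZRLE) {
        fprintf(stderr, "wrong compression flag %#x\n", buf[0]);
        return -1;
    }
    *xh_len = ((unsigned int)buf[1] << 8) | buf[2];  /* big-endian 16 bit */
    if (*xh_len > TOY_PAGE_SIZE) {
        fprintf(stderr, "len overflow: %u\n", *xh_len);
        return -1;
    }
    return 0;   /* *xh_len bytes of encoded data follow the header */
}

int main(void)
{
    const uint8_t stream[] = { 0x1, 0x00, 0x20 /* 32-byte payload */ };
    unsigned int len;

    if (toy_parse_xbzrle_header(stream, sizeof(stream), &len) == 0) {
        printf("encoded payload length: %u\n", len);
    }
    return 0;
}
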
2282
3d0684b2
JQ
2283/**
2284 * ram_block_from_stream: read a RAMBlock id from the migration stream
2285 *
2286 * Must be called from within a rcu critical section.
2287 *
56e93d26 2288 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2289 *
3d0684b2
JQ
2290 * @f: QEMUFile where to read the data from
2291 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2292 */
3d0684b2 2293static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2294{
2295 static RAMBlock *block = NULL;
2296 char id[256];
2297 uint8_t len;
2298
2299 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2300 if (!block) {
56e93d26
JQ
2301 error_report("Ack, bad migration stream!");
2302 return NULL;
2303 }
4c4bad48 2304 return block;
56e93d26
JQ
2305 }
2306
2307 len = qemu_get_byte(f);
2308 qemu_get_buffer(f, (uint8_t *)id, len);
2309 id[len] = 0;
2310
e3dd7493 2311 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2312 if (!block) {
2313 error_report("Can't find block %s", id);
2314 return NULL;
56e93d26
JQ
2315 }
2316
4c4bad48
HZ
2317 return block;
2318}
2319
2320static inline void *host_from_ram_block_offset(RAMBlock *block,
2321 ram_addr_t offset)
2322{
2323 if (!offset_in_ramblock(block, offset)) {
2324 return NULL;
2325 }
2326
2327 return block->host + offset;
56e93d26
JQ
2328}
2329
3d0684b2
JQ
2330/**
2331 * ram_handle_compressed: handle the zero page case
2332 *
56e93d26
JQ
2333 * If a page (or a whole RDMA chunk) has been
2334 * determined to be zero, then zap it.
3d0684b2
JQ
2335 *
2336 * @host: host address for the zero page
2337 * @ch: what the page is filled from. We only support zero
2338 * @size: size of the zero page
56e93d26
JQ
2339 */
2340void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2341{
2342 if (ch != 0 || !is_zero_range(host, size)) {
2343 memset(host, ch, size);
2344 }
2345}
2346
2347static void *do_data_decompress(void *opaque)
2348{
2349 DecompressParam *param = opaque;
2350 unsigned long pagesize;
33d151f4
LL
2351 uint8_t *des;
2352 int len;
56e93d26 2353
33d151f4 2354 qemu_mutex_lock(&param->mutex);
90e56fb4 2355 while (!param->quit) {
33d151f4
LL
2356 if (param->des) {
2357 des = param->des;
2358 len = param->len;
2359 param->des = 0;
2360 qemu_mutex_unlock(&param->mutex);
2361
56e93d26 2362 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2363 /* uncompress() may fail in some cases, especially
2364 * when the page was dirtied while it was being compressed; it's
2365 * not a problem because the dirty page will be retransferred
2366 * and uncompress() won't break the data in other pages.
2367 */
33d151f4
LL
2368 uncompress((Bytef *)des, &pagesize,
2369 (const Bytef *)param->compbuf, len);
73a8912b 2370
33d151f4
LL
2371 qemu_mutex_lock(&decomp_done_lock);
2372 param->done = true;
2373 qemu_cond_signal(&decomp_done_cond);
2374 qemu_mutex_unlock(&decomp_done_lock);
2375
2376 qemu_mutex_lock(&param->mutex);
2377 } else {
2378 qemu_cond_wait(&param->cond, &param->mutex);
2379 }
56e93d26 2380 }
33d151f4 2381 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2382
2383 return NULL;
2384}
2385
5533b2e9
LL
2386static void wait_for_decompress_done(void)
2387{
2388 int idx, thread_count;
2389
2390 if (!migrate_use_compression()) {
2391 return;
2392 }
2393
2394 thread_count = migrate_decompress_threads();
2395 qemu_mutex_lock(&decomp_done_lock);
2396 for (idx = 0; idx < thread_count; idx++) {
2397 while (!decomp_param[idx].done) {
2398 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2399 }
2400 }
2401 qemu_mutex_unlock(&decomp_done_lock);
2402}
2403
56e93d26
JQ
2404void migrate_decompress_threads_create(void)
2405{
2406 int i, thread_count;
2407
2408 thread_count = migrate_decompress_threads();
2409 decompress_threads = g_new0(QemuThread, thread_count);
2410 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2411 qemu_mutex_init(&decomp_done_lock);
2412 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2413 for (i = 0; i < thread_count; i++) {
2414 qemu_mutex_init(&decomp_param[i].mutex);
2415 qemu_cond_init(&decomp_param[i].cond);
2416 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2417 decomp_param[i].done = true;
90e56fb4 2418 decomp_param[i].quit = false;
56e93d26
JQ
2419 qemu_thread_create(decompress_threads + i, "decompress",
2420 do_data_decompress, decomp_param + i,
2421 QEMU_THREAD_JOINABLE);
2422 }
2423}
2424
2425void migrate_decompress_threads_join(void)
2426{
2427 int i, thread_count;
2428
56e93d26
JQ
2429 thread_count = migrate_decompress_threads();
2430 for (i = 0; i < thread_count; i++) {
2431 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2432 decomp_param[i].quit = true;
56e93d26
JQ
2433 qemu_cond_signal(&decomp_param[i].cond);
2434 qemu_mutex_unlock(&decomp_param[i].mutex);
2435 }
2436 for (i = 0; i < thread_count; i++) {
2437 qemu_thread_join(decompress_threads + i);
2438 qemu_mutex_destroy(&decomp_param[i].mutex);
2439 qemu_cond_destroy(&decomp_param[i].cond);
2440 g_free(decomp_param[i].compbuf);
2441 }
2442 g_free(decompress_threads);
2443 g_free(decomp_param);
56e93d26
JQ
2444 decompress_threads = NULL;
2445 decomp_param = NULL;
56e93d26
JQ
2446}
2447
c1bc6626 2448static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2449 void *host, int len)
2450{
2451 int idx, thread_count;
2452
2453 thread_count = migrate_decompress_threads();
73a8912b 2454 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2455 while (true) {
2456 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2457 if (decomp_param[idx].done) {
33d151f4
LL
2458 decomp_param[idx].done = false;
2459 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2460 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2461 decomp_param[idx].des = host;
2462 decomp_param[idx].len = len;
33d151f4
LL
2463 qemu_cond_signal(&decomp_param[idx].cond);
2464 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2465 break;
2466 }
2467 }
2468 if (idx < thread_count) {
2469 break;
73a8912b
LL
2470 } else {
2471 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2472 }
2473 }
73a8912b 2474 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2475}
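
/*
 * Illustrative sketch (not part of ram.c): the worker handshake used by the
 * decompression threads above - a per-worker mutex/condvar for handing work
 * in, plus a shared "done" condvar for the feeder to wait on - rebuilt with
 * plain pthreads and a dummy work item instead of zlib.  All names and the
 * single-worker setup are invented for the example.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    const char *work;          /* stands in for the compressed buffer */
    bool done;
    bool quit;
} Worker;

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;

static void *worker_thread(void *opaque)
{
    Worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->work) {
            const char *job = w->work;
            w->work = NULL;
            pthread_mutex_unlock(&w->mutex);

            printf("processing: %s\n", job);  /* uncompress() in the real code */

            pthread_mutex_lock(&done_lock);
            w->done = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);

            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}

int main(void)
{
    Worker w = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
                 NULL, true, false };
    pthread_t tid;

    pthread_create(&tid, NULL, worker_thread, &w);

    /* feed one job, as decompress_data_with_multi_threads does */
    pthread_mutex_lock(&done_lock);
    w.done = false;
    pthread_mutex_unlock(&done_lock);

    pthread_mutex_lock(&w.mutex);
    w.work = "page 0";
    pthread_cond_signal(&w.cond);
    pthread_mutex_unlock(&w.mutex);

    /* wait for it to finish, as wait_for_decompress_done does */
    pthread_mutex_lock(&done_lock);
    while (!w.done) {
        pthread_cond_wait(&done_cond, &done_lock);
    }
    pthread_mutex_unlock(&done_lock);

    /* tell the worker to quit and join it */
    pthread_mutex_lock(&w.mutex);
    w.quit = true;
    pthread_cond_signal(&w.cond);
    pthread_mutex_unlock(&w.mutex);
    pthread_join(tid, NULL);
    return 0;
}
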
2476
3d0684b2
JQ
2477/**
2478 * ram_postcopy_incoming_init: allocate postcopy data structures
2479 *
2480 * Returns 0 for success and negative if there was an error
2481 *
2482 * @mis: current migration incoming state
2483 *
2484 * Allocate data structures etc needed by incoming migration with
2485 * postcopy-ram. postcopy-ram's similarly named
2486 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2487 */
2488int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2489{
2490 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2491
2492 return postcopy_ram_incoming_init(mis, ram_pages);
2493}
2494
3d0684b2
JQ
2495/**
2496 * ram_load_postcopy: load a page in postcopy case
2497 *
2498 * Returns 0 for success or -errno in case of error
2499 *
a7180877
DDAG
2500 * Called in postcopy mode by ram_load().
2501 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2502 *
2503 * @f: QEMUFile where to send the data
a7180877
DDAG
2504 */
2505static int ram_load_postcopy(QEMUFile *f)
2506{
2507 int flags = 0, ret = 0;
2508 bool place_needed = false;
28abd200 2509 bool matching_page_sizes = false;
a7180877
DDAG
2510 MigrationIncomingState *mis = migration_incoming_get_current();
2511 /* Temporary page that is later 'placed' */
2512 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2513 void *last_host = NULL;
a3b6ff6d 2514 bool all_zero = false;
a7180877
DDAG
2515
2516 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2517 ram_addr_t addr;
2518 void *host = NULL;
2519 void *page_buffer = NULL;
2520 void *place_source = NULL;
df9ff5e1 2521 RAMBlock *block = NULL;
a7180877 2522 uint8_t ch;
a7180877
DDAG
2523
2524 addr = qemu_get_be64(f);
2525 flags = addr & ~TARGET_PAGE_MASK;
2526 addr &= TARGET_PAGE_MASK;
2527
2528 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2529 place_needed = false;
2530 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2531 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2532
2533 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2534 if (!host) {
2535 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2536 ret = -EINVAL;
2537 break;
2538 }
28abd200 2539 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2540 /*
28abd200
DDAG
2541 * Postcopy requires that we place whole host pages atomically;
2542 * these may be huge pages for RAMBlocks that are backed by
2543 * hugetlbfs.
a7180877
DDAG
2544 * To make it atomic, the data is read into a temporary page
2545 * that's moved into place later.
2546 * The migration protocol uses possibly smaller target pages;
2547 * however, the source ensures it always sends all the components
2548 * of a host page in order.
2549 */
2550 page_buffer = postcopy_host_page +
28abd200 2551 ((uintptr_t)host & (block->page_size - 1));
a7180877 2552 /* If all TP are zero then we can optimise the place */
28abd200 2553 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2554 all_zero = true;
c53b7ddc
DDAG
2555 } else {
2556 /* not the 1st TP within the HP */
2557 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2558 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2559 host, last_host);
2560 ret = -EINVAL;
2561 break;
2562 }
a7180877
DDAG
2563 }
2564
c53b7ddc 2565
a7180877
DDAG
2566 /*
2567 * If it's the last part of a host page then we place the host
2568 * page
2569 */
2570 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2571 (block->page_size - 1)) == 0;
a7180877
DDAG
2572 place_source = postcopy_host_page;
2573 }
c53b7ddc 2574 last_host = host;
a7180877
DDAG
2575
2576 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2577 case RAM_SAVE_FLAG_COMPRESS:
2578 ch = qemu_get_byte(f);
2579 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2580 if (ch) {
2581 all_zero = false;
2582 }
2583 break;
2584
2585 case RAM_SAVE_FLAG_PAGE:
2586 all_zero = false;
2587 if (!place_needed || !matching_page_sizes) {
2588 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2589 } else {
2590 /* Avoids the qemu_file copy during postcopy, since postcopy is
2591 * going to do a copy later anyway; we can only do it when we
2592 * do this read in one go (matching page sizes)
2593 */
2594 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2595 TARGET_PAGE_SIZE);
2596 }
2597 break;
2598 case RAM_SAVE_FLAG_EOS:
2599 /* normal exit */
2600 break;
2601 default:
2602 error_report("Unknown combination of migration flags: %#x"
2603 " (postcopy mode)", flags);
2604 ret = -EINVAL;
2605 }
2606
2607 if (place_needed) {
2608 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2609 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2610
a7180877 2611 if (all_zero) {
df9ff5e1
DDAG
2612 ret = postcopy_place_page_zero(mis, place_dest,
2613 block->page_size);
a7180877 2614 } else {
df9ff5e1
DDAG
2615 ret = postcopy_place_page(mis, place_dest,
2616 place_source, block->page_size);
a7180877
DDAG
2617 }
2618 }
2619 if (!ret) {
2620 ret = qemu_file_get_error(f);
2621 }
2622 }
2623
2624 return ret;
2625}
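
/*
 * Illustrative sketch (not part of ram.c): the offset arithmetic that
 * ram_load_postcopy uses above to assemble target pages into one temporary
 * host page and to decide when that host page is complete and can be placed.
 * The page sizes and the base address are example values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uintptr_t host_page_size = 2 * 1024 * 1024;   /* assumed hugepage size */
    uintptr_t target_page_size = 4 * 1024;
    uintptr_t host_page_mask = host_page_size - 1;

    /* walk the target pages of one (hypothetical) host page */
    uintptr_t base = 0x40000000;                  /* example host address */
    for (uintptr_t host = base; host < base + host_page_size;
         host += target_page_size) {
        uintptr_t offset_in_hp = host & host_page_mask;
        int first = (offset_in_hp == 0);
        int place_needed =
            ((host + target_page_size) & host_page_mask) == 0;

        if (first || place_needed) {
            printf("host %#lx: buffer offset %#lx%s%s\n",
                   (unsigned long)host, (unsigned long)offset_in_hp,
                   first ? " (first target page, reset all_zero)" : "",
                   place_needed ? " (last target page, place host page)" : "");
        }
    }
    return 0;
}
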
2626
56e93d26
JQ
2627static int ram_load(QEMUFile *f, void *opaque, int version_id)
2628{
2629 int flags = 0, ret = 0;
2630 static uint64_t seq_iter;
2631 int len = 0;
a7180877
DDAG
2632 /*
2633 * If system is running in postcopy mode, page inserts to host memory must
2634 * be atomic
2635 */
2636 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2637 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2638 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2639
2640 seq_iter++;
2641
2642 if (version_id != 4) {
2643 ret = -EINVAL;
2644 }
2645
2646 /* This RCU critical section can be very long running.
2647 * When RCU reclaims in the code start to become numerous,
2648 * it will be necessary to reduce the granularity of this
2649 * critical section.
2650 */
2651 rcu_read_lock();
a7180877
DDAG
2652
2653 if (postcopy_running) {
2654 ret = ram_load_postcopy(f);
2655 }
2656
2657 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2658 ram_addr_t addr, total_ram_bytes;
a776aa15 2659 void *host = NULL;
56e93d26
JQ
2660 uint8_t ch;
2661
2662 addr = qemu_get_be64(f);
2663 flags = addr & ~TARGET_PAGE_MASK;
2664 addr &= TARGET_PAGE_MASK;
2665
a776aa15
DDAG
2666 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2667 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2668 RAMBlock *block = ram_block_from_stream(f, flags);
2669
2670 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2671 if (!host) {
2672 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2673 ret = -EINVAL;
2674 break;
2675 }
2676 }
2677
56e93d26
JQ
2678 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2679 case RAM_SAVE_FLAG_MEM_SIZE:
2680 /* Synchronize RAM block list */
2681 total_ram_bytes = addr;
2682 while (!ret && total_ram_bytes) {
2683 RAMBlock *block;
56e93d26
JQ
2684 char id[256];
2685 ram_addr_t length;
2686
2687 len = qemu_get_byte(f);
2688 qemu_get_buffer(f, (uint8_t *)id, len);
2689 id[len] = 0;
2690 length = qemu_get_be64(f);
2691
e3dd7493
DDAG
2692 block = qemu_ram_block_by_name(id);
2693 if (block) {
2694 if (length != block->used_length) {
2695 Error *local_err = NULL;
56e93d26 2696
fa53a0e5 2697 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2698 &local_err);
2699 if (local_err) {
2700 error_report_err(local_err);
56e93d26 2701 }
56e93d26 2702 }
ef08fb38
DDAG
2703 /* For postcopy we need to check hugepage sizes match */
2704 if (postcopy_advised &&
2705 block->page_size != qemu_host_page_size) {
2706 uint64_t remote_page_size = qemu_get_be64(f);
2707 if (remote_page_size != block->page_size) {
2708 error_report("Mismatched RAM page size %s "
2709 "(local) %zd != %" PRId64,
2710 id, block->page_size,
2711 remote_page_size);
2712 ret = -EINVAL;
2713 }
2714 }
e3dd7493
DDAG
2715 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2716 block->idstr);
2717 } else {
56e93d26
JQ
2718 error_report("Unknown ramblock \"%s\", cannot "
2719 "accept migration", id);
2720 ret = -EINVAL;
2721 }
2722
2723 total_ram_bytes -= length;
2724 }
2725 break;
a776aa15 2726
56e93d26 2727 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2728 ch = qemu_get_byte(f);
2729 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2730 break;
a776aa15 2731
56e93d26 2732 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2733 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2734 break;
56e93d26 2735
a776aa15 2736 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2737 len = qemu_get_be32(f);
2738 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2739 error_report("Invalid compressed data length: %d", len);
2740 ret = -EINVAL;
2741 break;
2742 }
c1bc6626 2743 decompress_data_with_multi_threads(f, host, len);
56e93d26 2744 break;
a776aa15 2745
56e93d26 2746 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2747 if (load_xbzrle(f, addr, host) < 0) {
2748 error_report("Failed to decompress XBZRLE page at "
2749 RAM_ADDR_FMT, addr);
2750 ret = -EINVAL;
2751 break;
2752 }
2753 break;
2754 case RAM_SAVE_FLAG_EOS:
2755 /* normal exit */
2756 break;
2757 default:
2758 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2759 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2760 } else {
2761 error_report("Unknown combination of migration flags: %#x",
2762 flags);
2763 ret = -EINVAL;
2764 }
2765 }
2766 if (!ret) {
2767 ret = qemu_file_get_error(f);
2768 }
2769 }
2770
5533b2e9 2771 wait_for_decompress_done();
56e93d26 2772 rcu_read_unlock();
55c4446b 2773 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2774 return ret;
2775}
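
/*
 * Illustrative sketch (not part of ram.c): how the per-page header word that
 * ram_load and ram_load_postcopy read packs the page address and the flag
 * bits into one be64 - the flags live below the target page size, so masking
 * splits them apart.  The page size, the flag value and the address are
 * example values, not the real constants.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t target_page_size = 4096;
    const uint64_t target_page_mask = ~(target_page_size - 1);
    const uint64_t toy_flag_page = 0x08;         /* stand-in for a page flag */

    uint64_t header = (0x7f123000ULL & target_page_mask) | toy_flag_page;

    uint64_t flags = header & ~target_page_mask; /* low bits: flags */
    uint64_t addr  = header & target_page_mask;  /* high bits: page address */

    printf("header %#llx -> addr %#llx flags %#llx\n",
           (unsigned long long)header, (unsigned long long)addr,
           (unsigned long long)flags);
    return 0;
}
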
2776
2777static SaveVMHandlers savevm_ram_handlers = {
2778 .save_live_setup = ram_save_setup,
2779 .save_live_iterate = ram_save_iterate,
763c906b 2780 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2781 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2782 .save_live_pending = ram_save_pending,
2783 .load_state = ram_load,
6ad2a215 2784 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2785};
2786
2787void ram_mig_init(void)
2788{
2789 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2790 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2791}