git.proxmox.com Git - mirror_qemu.git/blame - migration/ram.c
ram: Move postcopy_requests into RAMState
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879
PB
29#include "qemu-common.h"
30#include "cpu.h"
56e93d26 31#include <zlib.h>
4addcd4f 32#include "qapi-event.h"
f348b6d1 33#include "qemu/cutils.h"
56e93d26
JQ
34#include "qemu/bitops.h"
35#include "qemu/bitmap.h"
7205c9ec
JQ
36#include "qemu/timer.h"
37#include "qemu/main-loop.h"
56e93d26 38#include "migration/migration.h"
e0b266f0 39#include "migration/postcopy-ram.h"
56e93d26
JQ
40#include "exec/address-spaces.h"
41#include "migration/page_cache.h"
56e93d26 42#include "qemu/error-report.h"
56e93d26 43#include "trace.h"
56e93d26 44#include "exec/ram_addr.h"
56e93d26 45#include "qemu/rcu_queue.h"
a91246c9 46#include "migration/colo.h"
56e93d26 47
56e93d26
JQ
48/***********************************************************/
49/* ram save/restore */
50
51#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52#define RAM_SAVE_FLAG_COMPRESS 0x02
53#define RAM_SAVE_FLAG_MEM_SIZE 0x04
54#define RAM_SAVE_FLAG_PAGE 0x08
55#define RAM_SAVE_FLAG_EOS 0x10
56#define RAM_SAVE_FLAG_CONTINUE 0x20
57#define RAM_SAVE_FLAG_XBZRLE 0x40
58/* 0x80 is reserved in migration.h start with 0x100 next */
59#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
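/*
 * A note on usage (sketch derived from the code below): these flags are
 * OR'ed into the low bits of the 8-byte page offset that save_page_header()
 * writes, e.g. "offset | RAM_SAVE_FLAG_PAGE".  Page offsets are
 * target-page aligned, so the low bits are free to carry flags.
 */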
60
adb65dec 61static uint8_t *ZERO_TARGET_PAGE;
56e93d26
JQ
62
63static inline bool is_zero_range(uint8_t *p, uint64_t size)
64{
a1febc49 65 return buffer_is_zero(p, size);
56e93d26
JQ
66}
67
68/* struct contains XBZRLE cache and a static page
69 used by the compression */
70static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78} XBZRLE;
79
80/* buffer used for XBZRLE decoding */
81static uint8_t *xbzrle_decoded_buf;
82
83static void XBZRLE_cache_lock(void)
84{
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87}
88
89static void XBZRLE_cache_unlock(void)
90{
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93}
94
3d0684b2
JQ
95/**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
102 *
103 * Returns the new_size or negative in case of error.
104 *
105 * @new_size: new cache size
56e93d26
JQ
106 */
107int64_t xbzrle_cache_resize(int64_t new_size)
108{
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134out_new_size:
135 ret = pow2floor(new_size);
136out:
137 XBZRLE_cache_unlock();
138 return ret;
139}
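/*
 * Usage sketch (illustrative): the result is rounded down to a power of
 * two by pow2floor(), so asking for 70 MiB yields a 64 MiB cache:
 *
 *     int64_t got = xbzrle_cache_resize(70 * 1024 * 1024);
 *     // got == 64 * 1024 * 1024 on success, negative on error
 */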
140
eb859c53
JQ
141struct RAMBitmap {
142 struct rcu_head rcu;
143 /* Main migration bitmap */
144 unsigned long *bmap;
145 /* bitmap of pages that haven't been sent even once;
146 * only maintained and used in postcopy at the moment,
147 * where it's used to send the dirty bitmap at the start
148 * of the postcopy phase
149 */
150 unsigned long *unsentmap;
151};
152typedef struct RAMBitmap RAMBitmap;
153
ec481c6c
JQ
154/*
155 * An outstanding page request, on the source, having been received
156 * and queued
157 */
158struct RAMSrcPageRequest {
159 RAMBlock *rb;
160 hwaddr offset;
161 hwaddr len;
162
163 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
164};
165
6f37bb8b
JQ
166/* State of RAM for migration */
167struct RAMState {
168 /* Last block that we have visited searching for dirty pages */
169 RAMBlock *last_seen_block;
170 /* Last block from where we have sent data */
171 RAMBlock *last_sent_block;
172 /* Last offset we have sent data from */
173 ram_addr_t last_offset;
174 /* last ram version we have seen */
175 uint32_t last_version;
176 /* We are in the first round */
177 bool ram_bulk_stage;
8d820d6f
JQ
178 /* How many times we have dirty too many pages */
179 int dirty_rate_high_cnt;
5a987738
JQ
180 /* How many times we have synchronized the bitmap */
181 uint64_t bitmap_sync_count;
f664da80
JQ
182 /* these variables are used for bitmap sync */
183 /* last time we did a full bitmap_sync */
184 int64_t time_last_bitmap_sync;
eac74159 185 /* bytes transferred at start_time */
c4bdf0cf 186 uint64_t bytes_xfer_prev;
a66cd90c 187 /* number of dirty pages since start_time */
68908ed6 188 uint64_t num_dirty_pages_period;
b5833fde
JQ
189 /* xbzrle misses since the beginning of the period */
190 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
191 /* number of iterations at the beginning of period */
192 uint64_t iterations_prev;
f7ccd61b
JQ
193 /* Accounting fields */
194 /* number of zero pages. It used to be pages filled by the same char. */
195 uint64_t zero_pages;
b4d1c6e7
JQ
196 /* number of normal transferred pages */
197 uint64_t norm_pages;
23b28c3c
JQ
198 /* Iterations since start */
199 uint64_t iterations;
f36ada95
JQ
200 /* xbzrle transmitted bytes. Note that these are post-compression,
201 * so they can't be calculated from the page count */
07ed50a2 202 uint64_t xbzrle_bytes;
f36ada95
JQ
203 /* xbzrle transmitted pages */
204 uint64_t xbzrle_pages;
544c36f1
JQ
205 /* xbzrle number of cache miss */
206 uint64_t xbzrle_cache_miss;
b07016b6
JQ
207 /* xbzrle miss rate */
208 double xbzrle_cache_miss_rate;
180f61f7
JQ
209 /* xbzrle number of overflows */
210 uint64_t xbzrle_overflows;
0d8ec885
JQ
211 /* number of dirty bits in the bitmap */
212 uint64_t migration_dirty_pages;
2f4fde93
JQ
213 /* total number of bytes transferred */
214 uint64_t bytes_transferred;
47ad8619
JQ
215 /* number of dirtied pages in the last second */
216 uint64_t dirty_pages_rate;
96506894
JQ
217 /* Count of requests incoming from destination */
218 uint64_t postcopy_requests;
108cfae0
JQ
219 /* protects modification of the bitmap */
220 QemuMutex bitmap_mutex;
eb859c53
JQ
221 /* Ram Bitmap protected by RCU */
222 RAMBitmap *ram_bitmap;
68a098f3
JQ
223 /* The RAMBlock used in the last src_page_requests */
224 RAMBlock *last_req_rb;
ec481c6c
JQ
225 /* Queue of outstanding page requests from the destination */
226 QemuMutex src_page_req_mutex;
227 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
228};
229typedef struct RAMState RAMState;
230
231static RAMState ram_state;
232
56e93d26
JQ
233uint64_t dup_mig_pages_transferred(void)
234{
f7ccd61b 235 return ram_state.zero_pages;
56e93d26
JQ
236}
237
56e93d26
JQ
238uint64_t norm_mig_pages_transferred(void)
239{
b4d1c6e7 240 return ram_state.norm_pages;
56e93d26
JQ
241}
242
243uint64_t xbzrle_mig_bytes_transferred(void)
244{
07ed50a2 245 return ram_state.xbzrle_bytes;
56e93d26
JQ
246}
247
248uint64_t xbzrle_mig_pages_transferred(void)
249{
f36ada95 250 return ram_state.xbzrle_pages;
56e93d26
JQ
251}
252
253uint64_t xbzrle_mig_pages_cache_miss(void)
254{
544c36f1 255 return ram_state.xbzrle_cache_miss;
56e93d26
JQ
256}
257
258double xbzrle_mig_cache_miss_rate(void)
259{
b07016b6 260 return ram_state.xbzrle_cache_miss_rate;
56e93d26
JQ
261}
262
263uint64_t xbzrle_mig_pages_overflow(void)
264{
180f61f7 265 return ram_state.xbzrle_overflows;
56e93d26
JQ
266}
267
9edabd4d 268uint64_t ram_bytes_transferred(void)
0d8ec885 269{
9edabd4d 270 return ram_state.bytes_transferred;
0d8ec885
JQ
271}
272
9edabd4d 273uint64_t ram_bytes_remaining(void)
2f4fde93 274{
9edabd4d 275 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
276}
277
42d219d3
JQ
278uint64_t ram_dirty_sync_count(void)
279{
280 return ram_state.bitmap_sync_count;
281}
282
47ad8619
JQ
283uint64_t ram_dirty_pages_rate(void)
284{
285 return ram_state.dirty_pages_rate;
286}
287
96506894
JQ
288uint64_t ram_postcopy_requests(void)
289{
290 return ram_state.postcopy_requests;
291}
292
b8fb8cb7
DDAG
293/* used by the search for pages to send */
294struct PageSearchStatus {
295 /* Current block being searched */
296 RAMBlock *block;
297 /* Current offset to search from */
298 ram_addr_t offset;
299 /* Set once we wrap around */
300 bool complete_round;
301};
302typedef struct PageSearchStatus PageSearchStatus;
303
56e93d26 304struct CompressParam {
56e93d26 305 bool done;
90e56fb4 306 bool quit;
56e93d26
JQ
307 QEMUFile *file;
308 QemuMutex mutex;
309 QemuCond cond;
310 RAMBlock *block;
311 ram_addr_t offset;
312};
313typedef struct CompressParam CompressParam;
314
315struct DecompressParam {
73a8912b 316 bool done;
90e56fb4 317 bool quit;
56e93d26
JQ
318 QemuMutex mutex;
319 QemuCond cond;
320 void *des;
d341d9f3 321 uint8_t *compbuf;
56e93d26
JQ
322 int len;
323};
324typedef struct DecompressParam DecompressParam;
325
326static CompressParam *comp_param;
327static QemuThread *compress_threads;
328/* comp_done_cond is used to wake up the migration thread when
329 * one of the compression threads has finished the compression.
330 * comp_done_lock is used together with comp_done_cond.
331 */
0d9f9a5c
LL
332static QemuMutex comp_done_lock;
333static QemuCond comp_done_cond;
56e93d26
JQ
334/* The empty QEMUFileOps will be used by file in CompressParam */
335static const QEMUFileOps empty_ops = { };
336
337static bool compression_switch;
56e93d26
JQ
338static DecompressParam *decomp_param;
339static QemuThread *decompress_threads;
73a8912b
LL
340static QemuMutex decomp_done_lock;
341static QemuCond decomp_done_cond;
56e93d26 342
a7a9a88f
LL
343static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
344 ram_addr_t offset);
56e93d26
JQ
345
346static void *do_data_compress(void *opaque)
347{
348 CompressParam *param = opaque;
a7a9a88f
LL
349 RAMBlock *block;
350 ram_addr_t offset;
56e93d26 351
a7a9a88f 352 qemu_mutex_lock(&param->mutex);
90e56fb4 353 while (!param->quit) {
a7a9a88f
LL
354 if (param->block) {
355 block = param->block;
356 offset = param->offset;
357 param->block = NULL;
358 qemu_mutex_unlock(&param->mutex);
359
360 do_compress_ram_page(param->file, block, offset);
361
0d9f9a5c 362 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 363 param->done = true;
0d9f9a5c
LL
364 qemu_cond_signal(&comp_done_cond);
365 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
366
367 qemu_mutex_lock(&param->mutex);
368 } else {
56e93d26
JQ
369 qemu_cond_wait(&param->cond, &param->mutex);
370 }
56e93d26 371 }
a7a9a88f 372 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
373
374 return NULL;
375}
376
377static inline void terminate_compression_threads(void)
378{
379 int idx, thread_count;
380
381 thread_count = migrate_compress_threads();
3d0684b2 382
56e93d26
JQ
383 for (idx = 0; idx < thread_count; idx++) {
384 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 385 comp_param[idx].quit = true;
56e93d26
JQ
386 qemu_cond_signal(&comp_param[idx].cond);
387 qemu_mutex_unlock(&comp_param[idx].mutex);
388 }
389}
390
391void migrate_compress_threads_join(void)
392{
393 int i, thread_count;
394
395 if (!migrate_use_compression()) {
396 return;
397 }
398 terminate_compression_threads();
399 thread_count = migrate_compress_threads();
400 for (i = 0; i < thread_count; i++) {
401 qemu_thread_join(compress_threads + i);
402 qemu_fclose(comp_param[i].file);
403 qemu_mutex_destroy(&comp_param[i].mutex);
404 qemu_cond_destroy(&comp_param[i].cond);
405 }
0d9f9a5c
LL
406 qemu_mutex_destroy(&comp_done_lock);
407 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
408 g_free(compress_threads);
409 g_free(comp_param);
56e93d26
JQ
410 compress_threads = NULL;
411 comp_param = NULL;
56e93d26
JQ
412}
413
414void migrate_compress_threads_create(void)
415{
416 int i, thread_count;
417
418 if (!migrate_use_compression()) {
419 return;
420 }
56e93d26
JQ
421 compression_switch = true;
422 thread_count = migrate_compress_threads();
423 compress_threads = g_new0(QemuThread, thread_count);
424 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
425 qemu_cond_init(&comp_done_cond);
426 qemu_mutex_init(&comp_done_lock);
56e93d26 427 for (i = 0; i < thread_count; i++) {
e110aa91
C
428 /* comp_param[i].file is just used as a dummy buffer to save data,
429 * set its ops to empty.
56e93d26
JQ
430 */
431 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
432 comp_param[i].done = true;
90e56fb4 433 comp_param[i].quit = false;
56e93d26
JQ
434 qemu_mutex_init(&comp_param[i].mutex);
435 qemu_cond_init(&comp_param[i].cond);
436 qemu_thread_create(compress_threads + i, "compress",
437 do_data_compress, comp_param + i,
438 QEMU_THREAD_JOINABLE);
439 }
440}
441
442/**
3d0684b2 443 * save_page_header: write page header to wire
56e93d26
JQ
444 *
445 * If this is the 1st block, it also writes the block identification
446 *
3d0684b2 447 * Returns the number of bytes written
56e93d26
JQ
448 *
449 * @f: QEMUFile where to send the data
450 * @block: block that contains the page we want to send
451 * @offset: offset inside the block for the page
452 * in the lower bits, it contains flags
453 */
454static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
455{
9f5f380b 456 size_t size, len;
56e93d26
JQ
457
458 qemu_put_be64(f, offset);
459 size = 8;
460
461 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b
LL
462 len = strlen(block->idstr);
463 qemu_put_byte(f, len);
464 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
465 size += 1 + len;
56e93d26
JQ
466 }
467 return size;
468}
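/*
 * Resulting wire format (sketch, derived from the code above):
 *
 *     8 bytes  page offset, with RAM_SAVE_FLAG_* OR'ed into the low bits
 *     1 byte   strlen(block->idstr)  \ only when RAM_SAVE_FLAG_CONTINUE
 *     n bytes  block->idstr          / is not set (i.e. a new block)
 *
 * so pages that continue the current block cost only 8 header bytes.
 */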
469
3d0684b2
JQ
470/**
471 * mig_throttle_guest_down: throttle down the guest
472 *
473 * Reduce amount of guest cpu execution to hopefully slow down memory
474 * writes. If guest dirty memory rate is reduced below the rate at
475 * which we can transfer pages to the destination then we should be
476 * able to complete migration. Some workloads dirty memory way too
477 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
478 */
479static void mig_throttle_guest_down(void)
480{
481 MigrationState *s = migrate_get_current();
2594f56d
DB
482 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
483 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
070afca2
JH
484
485 /* We have not started throttling yet. Let's start it. */
486 if (!cpu_throttle_active()) {
487 cpu_throttle_set(pct_initial);
488 } else {
489 /* Throttling already on, just increase the rate */
490 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
491 }
492}
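/*
 * Example (illustrative): with cpu_throttle_initial=20 and
 * cpu_throttle_increment=10, the first call sets the throttle to 20% and
 * each further call while throttling is active bumps it to 30%, 40%, ...
 */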
493
3d0684b2
JQ
494/**
495 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
496 *
6f37bb8b 497 * @rs: current RAM state
3d0684b2
JQ
498 * @current_addr: address for the zero page
499 *
500 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
501 * The important thing is that a stale (not-yet-0'd) page be replaced
502 * by the new data.
503 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 504 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 505 */
6f37bb8b 506static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 507{
6f37bb8b 508 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
509 return;
510 }
511
512 /* We don't care if this fails to allocate a new cache page
513 * as long as it updated an old one */
514 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
5a987738 515 rs->bitmap_sync_count);
56e93d26
JQ
516}
517
518#define ENCODING_FLAG_XBZRLE 0x1
519
520/**
521 * save_xbzrle_page: compress and send current page
522 *
523 * Returns: 1 means that we wrote the page
524 * 0 means that page is identical to the one already sent
525 * -1 means that xbzrle would be longer than normal
526 *
5a987738 527 * @rs: current RAM state
56e93d26 528 * @f: QEMUFile where to send the data
3d0684b2
JQ
529 * @current_data: pointer to the address of the page contents
530 * @current_addr: addr of the page
56e93d26
JQ
531 * @block: block that contains the page we want to send
532 * @offset: offset inside the block for the page
533 * @last_stage: if we are at the completion stage
56e93d26 534 */
5a987738 535static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
56e93d26 536 ram_addr_t current_addr, RAMBlock *block,
072c2511 537 ram_addr_t offset, bool last_stage)
56e93d26
JQ
538{
539 int encoded_len = 0, bytes_xbzrle;
540 uint8_t *prev_cached_page;
541
5a987738 542 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
544c36f1 543 rs->xbzrle_cache_miss++;
56e93d26
JQ
544 if (!last_stage) {
545 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
5a987738 546 rs->bitmap_sync_count) == -1) {
56e93d26
JQ
547 return -1;
548 } else {
549 /* update *current_data when the page has been
550 inserted into cache */
551 *current_data = get_cached_data(XBZRLE.cache, current_addr);
552 }
553 }
554 return -1;
555 }
556
557 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
558
559 /* save current buffer into memory */
560 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
561
562 /* XBZRLE encoding (if there is no overflow) */
563 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
564 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
565 TARGET_PAGE_SIZE);
566 if (encoded_len == 0) {
55c4446b 567 trace_save_xbzrle_page_skipping();
56e93d26
JQ
568 return 0;
569 } else if (encoded_len == -1) {
55c4446b 570 trace_save_xbzrle_page_overflow();
180f61f7 571 rs->xbzrle_overflows++;
56e93d26
JQ
572 /* update data in the cache */
573 if (!last_stage) {
574 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
575 *current_data = prev_cached_page;
576 }
577 return -1;
578 }
579
580 /* we need to update the data in the cache, in order to get the same data */
581 if (!last_stage) {
582 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
583 }
584
585 /* Send XBZRLE based compressed page */
586 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
587 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
588 qemu_put_be16(f, encoded_len);
589 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
590 bytes_xbzrle += encoded_len + 1 + 2;
f36ada95 591 rs->xbzrle_pages++;
07ed50a2 592 rs->xbzrle_bytes += bytes_xbzrle;
072c2511 593 rs->bytes_transferred += bytes_xbzrle;
56e93d26
JQ
594
595 return 1;
596}
597
3d0684b2
JQ
598/**
599 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 600 *
3d0684b2
JQ
601 * Called with rcu_read_lock() to protect migration_bitmap
602 *
603 * Returns the byte offset within memory region of the start of a dirty page
604 *
6f37bb8b 605 * @rs: current RAM state
3d0684b2
JQ
606 * @rb: RAMBlock where to search for dirty pages
607 * @start: starting address (typically so we can continue from previous page)
608 * @ram_addr_abs: pointer into which to store the address of the dirty page
609 * within the global ram_addr space
f3f491fc 610 */
56e93d26 611static inline
6f37bb8b 612ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
a82d593b
DDAG
613 ram_addr_t start,
614 ram_addr_t *ram_addr_abs)
56e93d26 615{
2f68e399 616 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
56e93d26 617 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
2f68e399
DDAG
618 uint64_t rb_size = rb->used_length;
619 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
2ff64038 620 unsigned long *bitmap;
56e93d26
JQ
621
622 unsigned long next;
623
eb859c53 624 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
6f37bb8b 625 if (rs->ram_bulk_stage && nr > base) {
56e93d26
JQ
626 next = nr + 1;
627 } else {
2ff64038 628 next = find_next_bit(bitmap, size, nr);
56e93d26
JQ
629 }
630
f3f491fc 631 *ram_addr_abs = next << TARGET_PAGE_BITS;
56e93d26
JQ
632 return (next - base) << TARGET_PAGE_BITS;
633}
634
0d8ec885 635static inline bool migration_bitmap_clear_dirty(RAMState *rs, ram_addr_t addr)
a82d593b
DDAG
636{
637 bool ret;
638 int nr = addr >> TARGET_PAGE_BITS;
eb859c53 639 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
a82d593b
DDAG
640
641 ret = test_and_clear_bit(nr, bitmap);
642
643 if (ret) {
0d8ec885 644 rs->migration_dirty_pages--;
a82d593b
DDAG
645 }
646 return ret;
647}
648
a66cd90c
JQ
649static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
650 ram_addr_t length)
56e93d26 651{
2ff64038 652 unsigned long *bitmap;
eb859c53 653 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
0d8ec885
JQ
654 rs->migration_dirty_pages +=
655 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length,
656 &rs->num_dirty_pages_period);
56e93d26
JQ
657}
658
3d0684b2
JQ
659/**
660 * ram_pagesize_summary: calculate all the pagesizes of a VM
661 *
662 * Returns a summary bitmap of the page sizes of all RAMBlocks
663 *
664 * For VMs with just normal pages this is equivalent to the host page
665 * size. If it's got some huge pages then it's the OR of all the
666 * different page sizes.
e8ca1db2
DDAG
667 */
668uint64_t ram_pagesize_summary(void)
669{
670 RAMBlock *block;
671 uint64_t summary = 0;
672
673 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
674 summary |= block->page_size;
675 }
676
677 return summary;
678}
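/*
 * Example (illustrative): a guest whose RAMBlocks all use 4 KiB pages gets
 * a summary of 0x1000; adding a 2 MiB hugepage-backed block changes it to
 * 0x1000 | 0x200000 == 0x201000.
 */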
679
8d820d6f 680static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
681{
682 RAMBlock *block;
56e93d26 683 int64_t end_time;
c4bdf0cf 684 uint64_t bytes_xfer_now;
56e93d26 685
5a987738 686 rs->bitmap_sync_count++;
56e93d26 687
eac74159
JQ
688 if (!rs->bytes_xfer_prev) {
689 rs->bytes_xfer_prev = ram_bytes_transferred();
56e93d26
JQ
690 }
691
f664da80
JQ
692 if (!rs->time_last_bitmap_sync) {
693 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
694 }
695
696 trace_migration_bitmap_sync_start();
9c1f8f44 697 memory_global_dirty_log_sync();
56e93d26 698
108cfae0 699 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26
JQ
700 rcu_read_lock();
701 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
a66cd90c 702 migration_bitmap_sync_range(rs, block->offset, block->used_length);
56e93d26
JQ
703 }
704 rcu_read_unlock();
108cfae0 705 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 706
a66cd90c 707 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 708
56e93d26
JQ
709 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
710
711 /* more than 1 second = 1000 milliseconds */
f664da80 712 if (end_time > rs->time_last_bitmap_sync + 1000) {
56e93d26
JQ
713 if (migrate_auto_converge()) {
714 /* The following detection logic can be refined later. For now:
715 Check to see if the number of dirtied bytes is 50% more than the approx.
716 amount of bytes that just got transferred since the last time we
070afca2
JH
717 were in this routine. If that happens twice, start or increase
718 throttling */
56e93d26 719 bytes_xfer_now = ram_bytes_transferred();
070afca2 720
47ad8619 721 if (rs->dirty_pages_rate &&
a66cd90c 722 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 723 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
8d820d6f 724 (rs->dirty_rate_high_cnt++ >= 2)) {
56e93d26 725 trace_migration_throttle();
8d820d6f 726 rs->dirty_rate_high_cnt = 0;
070afca2 727 mig_throttle_guest_down();
56e93d26 728 }
eac74159 729 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 730 }
070afca2 731
56e93d26 732 if (migrate_use_xbzrle()) {
23b28c3c 733 if (rs->iterations_prev != rs->iterations) {
b07016b6 734 rs->xbzrle_cache_miss_rate =
544c36f1 735 (double)(rs->xbzrle_cache_miss -
b5833fde 736 rs->xbzrle_cache_miss_prev) /
23b28c3c 737 (rs->iterations - rs->iterations_prev);
56e93d26 738 }
23b28c3c 739 rs->iterations_prev = rs->iterations;
544c36f1 740 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
56e93d26 741 }
47ad8619 742 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
f664da80 743 / (end_time - rs->time_last_bitmap_sync);
f664da80 744 rs->time_last_bitmap_sync = end_time;
a66cd90c 745 rs->num_dirty_pages_period = 0;
56e93d26 746 }
4addcd4f 747 if (migrate_use_events()) {
5a987738 748 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
4addcd4f 749 }
56e93d26
JQ
750}
751
752/**
3d0684b2 753 * save_zero_page: send the zero page to the stream
56e93d26 754 *
3d0684b2 755 * Returns the number of pages written.
56e93d26 756 *
f7ccd61b 757 * @rs: current RAM state
56e93d26
JQ
758 * @f: QEMUFile where to send the data
759 * @block: block that contains the page we want to send
760 * @offset: offset inside the block for the page
761 * @p: pointer to the page
56e93d26 762 */
f7ccd61b 763static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
072c2511 764 ram_addr_t offset, uint8_t *p)
56e93d26
JQ
765{
766 int pages = -1;
767
768 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
f7ccd61b 769 rs->zero_pages++;
072c2511
JQ
770 rs->bytes_transferred +=
771 save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS);
56e93d26 772 qemu_put_byte(f, 0);
072c2511 773 rs->bytes_transferred += 1;
56e93d26
JQ
774 pages = 1;
775 }
776
777 return pages;
778}
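/*
 * Cost sketch (derived from the code above): a zero page is sent as its
 * page header plus a single 0 byte, i.e. typically 9 bytes when the
 * RAM_SAVE_FLAG_CONTINUE form of the header is used.
 */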
779
36449157 780static void ram_release_pages(MigrationState *ms, const char *rbname,
53f09a10
PB
781 uint64_t offset, int pages)
782{
783 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
784 return;
785 }
786
36449157 787 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
788}
789
56e93d26 790/**
3d0684b2 791 * ram_save_page: send the given page to the stream
56e93d26 792 *
3d0684b2 793 * Returns the number of pages written.
3fd3c4b3
DDAG
794 * < 0 - error
795 * >=0 - Number of pages written - this might legally be 0
796 * if xbzrle noticed the page was the same.
56e93d26 797 *
6f37bb8b 798 * @rs: current RAM state
3d0684b2 799 * @ms: current migration state
56e93d26
JQ
800 * @f: QEMUFile where to send the data
801 * @block: block that contains the page we want to send
802 * @offset: offset inside the block for the page
803 * @last_stage: if we are at the completion stage
56e93d26 804 */
6f37bb8b 805static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
072c2511 806 PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
807{
808 int pages = -1;
809 uint64_t bytes_xmit;
810 ram_addr_t current_addr;
56e93d26
JQ
811 uint8_t *p;
812 int ret;
813 bool send_async = true;
a08f6890
HZ
814 RAMBlock *block = pss->block;
815 ram_addr_t offset = pss->offset;
56e93d26 816
2f68e399 817 p = block->host + offset;
56e93d26
JQ
818
819 /* When in doubt, send the page as normal */
820 bytes_xmit = 0;
821 ret = ram_control_save_page(f, block->offset,
822 offset, TARGET_PAGE_SIZE, &bytes_xmit);
823 if (bytes_xmit) {
072c2511 824 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
825 pages = 1;
826 }
827
828 XBZRLE_cache_lock();
829
830 current_addr = block->offset + offset;
831
6f37bb8b 832 if (block == rs->last_sent_block) {
56e93d26
JQ
833 offset |= RAM_SAVE_FLAG_CONTINUE;
834 }
835 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
836 if (ret != RAM_SAVE_CONTROL_DELAYED) {
837 if (bytes_xmit > 0) {
b4d1c6e7 838 rs->norm_pages++;
56e93d26 839 } else if (bytes_xmit == 0) {
f7ccd61b 840 rs->zero_pages++;
56e93d26
JQ
841 }
842 }
843 } else {
072c2511 844 pages = save_zero_page(rs, f, block, offset, p);
56e93d26
JQ
845 if (pages > 0) {
846 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
847 * page would be stale
848 */
6f37bb8b 849 xbzrle_cache_zero_page(rs, current_addr);
53f09a10 850 ram_release_pages(ms, block->idstr, pss->offset, pages);
6f37bb8b 851 } else if (!rs->ram_bulk_stage &&
9eb14766 852 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
5a987738 853 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
072c2511 854 offset, last_stage);
56e93d26
JQ
855 if (!last_stage) {
856 /* Can't send this cached data async, since the cache page
857 * might get updated before it gets to the wire
858 */
859 send_async = false;
860 }
861 }
862 }
863
864 /* XBZRLE overflow or normal page */
865 if (pages == -1) {
072c2511 866 rs->bytes_transferred += save_page_header(f, block,
56e93d26
JQ
867 offset | RAM_SAVE_FLAG_PAGE);
868 if (send_async) {
53f09a10
PB
869 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
870 migrate_release_ram() &
871 migration_in_postcopy(ms));
56e93d26
JQ
872 } else {
873 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
874 }
072c2511 875 rs->bytes_transferred += TARGET_PAGE_SIZE;
56e93d26 876 pages = 1;
b4d1c6e7 877 rs->norm_pages++;
56e93d26
JQ
878 }
879
880 XBZRLE_cache_unlock();
881
882 return pages;
883}
884
a7a9a88f
LL
885static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
886 ram_addr_t offset)
56e93d26
JQ
887{
888 int bytes_sent, blen;
a7a9a88f 889 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 890
a7a9a88f 891 bytes_sent = save_page_header(f, block, offset |
56e93d26 892 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 893 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 894 migrate_compress_level());
b3be2896
LL
895 if (blen < 0) {
896 bytes_sent = 0;
897 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
898 error_report("compressed data failed!");
899 } else {
900 bytes_sent += blen;
53f09a10
PB
901 ram_release_pages(migrate_get_current(), block->idstr,
902 offset & TARGET_PAGE_MASK, 1);
b3be2896 903 }
56e93d26
JQ
904
905 return bytes_sent;
906}
907
2f4fde93 908static void flush_compressed_data(RAMState *rs, QEMUFile *f)
56e93d26
JQ
909{
910 int idx, len, thread_count;
911
912 if (!migrate_use_compression()) {
913 return;
914 }
915 thread_count = migrate_compress_threads();
a7a9a88f 916
0d9f9a5c 917 qemu_mutex_lock(&comp_done_lock);
56e93d26 918 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 919 while (!comp_param[idx].done) {
0d9f9a5c 920 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 921 }
a7a9a88f 922 }
0d9f9a5c 923 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
924
925 for (idx = 0; idx < thread_count; idx++) {
926 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 927 if (!comp_param[idx].quit) {
56e93d26 928 len = qemu_put_qemu_file(f, comp_param[idx].file);
2f4fde93 929 rs->bytes_transferred += len;
56e93d26 930 }
a7a9a88f 931 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
932 }
933}
934
935static inline void set_compress_params(CompressParam *param, RAMBlock *block,
936 ram_addr_t offset)
937{
938 param->block = block;
939 param->offset = offset;
940}
941
b4d1c6e7 942static int compress_page_with_multi_thread(RAMState *rs, QEMUFile *f,
072c2511 943 RAMBlock *block, ram_addr_t offset)
56e93d26
JQ
944{
945 int idx, thread_count, bytes_xmit = -1, pages = -1;
946
947 thread_count = migrate_compress_threads();
0d9f9a5c 948 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
949 while (true) {
950 for (idx = 0; idx < thread_count; idx++) {
951 if (comp_param[idx].done) {
a7a9a88f 952 comp_param[idx].done = false;
56e93d26 953 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
a7a9a88f 954 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 955 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
956 qemu_cond_signal(&comp_param[idx].cond);
957 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 958 pages = 1;
b4d1c6e7 959 rs->norm_pages++;
072c2511 960 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
961 break;
962 }
963 }
964 if (pages > 0) {
965 break;
966 } else {
0d9f9a5c 967 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
968 }
969 }
0d9f9a5c 970 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
971
972 return pages;
973}
974
975/**
976 * ram_save_compressed_page: compress the given page and send it to the stream
977 *
3d0684b2 978 * Returns the number of pages written.
56e93d26 979 *
6f37bb8b 980 * @rs: current RAM state
3d0684b2 981 * @ms: current migration state
56e93d26
JQ
982 * @f: QEMUFile where to send the data
983 * @block: block that contains the page we want to send
984 * @offset: offset inside the block for the page
985 * @last_stage: if we are at the completion stage
56e93d26 986 */
6f37bb8b
JQ
987static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
988 QEMUFile *f,
072c2511 989 PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
990{
991 int pages = -1;
fc50438e 992 uint64_t bytes_xmit = 0;
56e93d26 993 uint8_t *p;
fc50438e 994 int ret, blen;
a08f6890
HZ
995 RAMBlock *block = pss->block;
996 ram_addr_t offset = pss->offset;
56e93d26 997
2f68e399 998 p = block->host + offset;
56e93d26 999
56e93d26
JQ
1000 ret = ram_control_save_page(f, block->offset,
1001 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1002 if (bytes_xmit) {
072c2511 1003 rs->bytes_transferred += bytes_xmit;
56e93d26
JQ
1004 pages = 1;
1005 }
56e93d26
JQ
1006 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1007 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1008 if (bytes_xmit > 0) {
b4d1c6e7 1009 rs->norm_pages++;
56e93d26 1010 } else if (bytes_xmit == 0) {
f7ccd61b 1011 rs->zero_pages++;
56e93d26
JQ
1012 }
1013 }
1014 } else {
1015 /* When starting the process of a new block, the first page of
1016 * the block should be sent out before other pages in the same
1017 * block, and all the pages in last block should have been sent
1018 * out, keeping this order is important, because the 'cont' flag
1019 * is used to avoid resending the block name.
1020 */
6f37bb8b 1021 if (block != rs->last_sent_block) {
2f4fde93 1022 flush_compressed_data(rs, f);
072c2511 1023 pages = save_zero_page(rs, f, block, offset, p);
56e93d26 1024 if (pages == -1) {
fc50438e
LL
1025 /* Make sure the first page is sent out before other pages */
1026 bytes_xmit = save_page_header(f, block, offset |
1027 RAM_SAVE_FLAG_COMPRESS_PAGE);
1028 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1029 migrate_compress_level());
1030 if (blen > 0) {
072c2511 1031 rs->bytes_transferred += bytes_xmit + blen;
b4d1c6e7 1032 rs->norm_pages++;
b3be2896 1033 pages = 1;
fc50438e
LL
1034 } else {
1035 qemu_file_set_error(f, blen);
1036 error_report("compressed data failed!");
b3be2896 1037 }
56e93d26 1038 }
53f09a10
PB
1039 if (pages > 0) {
1040 ram_release_pages(ms, block->idstr, pss->offset, pages);
1041 }
56e93d26 1042 } else {
fc50438e 1043 offset |= RAM_SAVE_FLAG_CONTINUE;
072c2511 1044 pages = save_zero_page(rs, f, block, offset, p);
56e93d26 1045 if (pages == -1) {
072c2511 1046 pages = compress_page_with_multi_thread(rs, f, block, offset);
53f09a10
PB
1047 } else {
1048 ram_release_pages(ms, block->idstr, pss->offset, pages);
56e93d26
JQ
1049 }
1050 }
1051 }
1052
1053 return pages;
1054}
1055
3d0684b2
JQ
1056/**
1057 * find_dirty_block: find the next dirty page and update any state
1058 * associated with the search process.
b9e60928 1059 *
3d0684b2 1060 * Returns true if a page is found
b9e60928 1061 *
6f37bb8b 1062 * @rs: current RAM state
3d0684b2
JQ
1063 * @f: QEMUFile where to send the data
1064 * @pss: data about the state of the current dirty page scan
1065 * @again: set to false if the search has scanned the whole of RAM
1066 * @ram_addr_abs: pointer into which to store the address of the dirty page
1067 * within the global ram_addr space
b9e60928 1068 */
6f37bb8b 1069static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
f3f491fc 1070 bool *again, ram_addr_t *ram_addr_abs)
b9e60928 1071{
6f37bb8b 1072 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
a82d593b 1073 ram_addr_abs);
6f37bb8b
JQ
1074 if (pss->complete_round && pss->block == rs->last_seen_block &&
1075 pss->offset >= rs->last_offset) {
b9e60928
DDAG
1076 /*
1077 * We've been once around the RAM and haven't found anything.
1078 * Give up.
1079 */
1080 *again = false;
1081 return false;
1082 }
1083 if (pss->offset >= pss->block->used_length) {
1084 /* Didn't find anything in this RAM Block */
1085 pss->offset = 0;
1086 pss->block = QLIST_NEXT_RCU(pss->block, next);
1087 if (!pss->block) {
1088 /* Hit the end of the list */
1089 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1090 /* Flag that we've looped */
1091 pss->complete_round = true;
6f37bb8b 1092 rs->ram_bulk_stage = false;
b9e60928
DDAG
1093 if (migrate_use_xbzrle()) {
1094 /* If xbzrle is on, stop using the data compression at this
1095 * point. In theory, xbzrle can do better than compression.
1096 */
2f4fde93 1097 flush_compressed_data(rs, f);
b9e60928
DDAG
1098 compression_switch = false;
1099 }
1100 }
1101 /* Didn't find anything this time, but try again on the new block */
1102 *again = true;
1103 return false;
1104 } else {
1105 /* Can go around again, but... */
1106 *again = true;
1107 /* We've found something so probably don't need to */
1108 return true;
1109 }
1110}
1111
3d0684b2
JQ
1112/**
1113 * unqueue_page: gets a page off the queue
1114 *
a82d593b 1115 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1116 *
3d0684b2
JQ
1117 * Returns the block of the page (or NULL if none available)
1118 *
ec481c6c 1119 * @rs: current RAM state
3d0684b2
JQ
1120 * @offset: used to return the offset within the RAMBlock
1121 * @ram_addr_abs: pointer into which to store the address of the dirty page
1122 * within the global ram_addr space
a82d593b 1123 */
ec481c6c 1124static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset,
a82d593b
DDAG
1125 ram_addr_t *ram_addr_abs)
1126{
1127 RAMBlock *block = NULL;
1128
ec481c6c
JQ
1129 qemu_mutex_lock(&rs->src_page_req_mutex);
1130 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1131 struct RAMSrcPageRequest *entry =
1132 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1133 block = entry->rb;
1134 *offset = entry->offset;
1135 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1136 TARGET_PAGE_MASK;
1137
1138 if (entry->len > TARGET_PAGE_SIZE) {
1139 entry->len -= TARGET_PAGE_SIZE;
1140 entry->offset += TARGET_PAGE_SIZE;
1141 } else {
1142 memory_region_unref(block->mr);
ec481c6c 1143 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1144 g_free(entry);
1145 }
1146 }
ec481c6c 1147 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1148
1149 return block;
1150}
1151
3d0684b2
JQ
1152/**
1153 * get_queued_page: unqueue a page from the postcopy requests
1154 *
1155 * Skips pages that are already sent (!dirty)
a82d593b 1156 *
3d0684b2 1157 * Returns true if a queued page is found
a82d593b 1158 *
6f37bb8b 1159 * @rs: current RAM state
3d0684b2
JQ
1160 * @pss: data about the state of the current dirty page scan
1161 * @ram_addr_abs: pointer into which to store the address of the dirty page
1162 * within the global ram_addr space
a82d593b 1163 */
ec481c6c 1164static bool get_queued_page(RAMState *rs, PageSearchStatus *pss,
a82d593b
DDAG
1165 ram_addr_t *ram_addr_abs)
1166{
1167 RAMBlock *block;
1168 ram_addr_t offset;
1169 bool dirty;
1170
1171 do {
ec481c6c 1172 block = unqueue_page(rs, &offset, ram_addr_abs);
a82d593b
DDAG
1173 /*
1174 * We're sending this page, and since it's postcopy nothing else
1175 * will dirty it, and we must make sure it doesn't get sent again
1176 * even if this queue request was received after the background
1177 * search already sent it.
1178 */
1179 if (block) {
1180 unsigned long *bitmap;
eb859c53 1181 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
a82d593b
DDAG
1182 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1183 if (!dirty) {
1184 trace_get_queued_page_not_dirty(
1185 block->idstr, (uint64_t)offset,
1186 (uint64_t)*ram_addr_abs,
1187 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
eb859c53 1188 atomic_rcu_read(&rs->ram_bitmap)->unsentmap));
a82d593b
DDAG
1189 } else {
1190 trace_get_queued_page(block->idstr,
1191 (uint64_t)offset,
1192 (uint64_t)*ram_addr_abs);
1193 }
1194 }
1195
1196 } while (block && !dirty);
1197
1198 if (block) {
1199 /*
1200 * As soon as we start servicing pages out of order, then we have
1201 * to kill the bulk stage, since the bulk stage assumes
1202 * in (migration_bitmap_find_and_reset_dirty) that every page is
1203 * dirty, that's no longer true.
1204 */
6f37bb8b 1205 rs->ram_bulk_stage = false;
a82d593b
DDAG
1206
1207 /*
1208 * We want the background search to continue from the queued page
1209 * since the guest is likely to want other pages near to the page
1210 * it just requested.
1211 */
1212 pss->block = block;
1213 pss->offset = offset;
1214 }
1215
1216 return !!block;
1217}
1218
6c595cde 1219/**
5e58f968
JQ
1220 * migration_page_queue_free: drop any remaining pages in the ram
1221 * request queue
6c595cde 1222 *
3d0684b2
JQ
1223 * It should be empty at the end anyway, but in error cases there may
1224 * be some left; any page still queued here is dropped.
1225 *
6c595cde 1226 */
ec481c6c 1227void migration_page_queue_free(void)
6c595cde 1228{
ec481c6c
JQ
1229 struct RAMSrcPageRequest *mspr, *next_mspr;
1230 RAMState *rs = &ram_state;
6c595cde
DDAG
1231 /* This queue should generally be empty - but a failed
1232 * migration might leave some entries behind.
1233 */
1234 rcu_read_lock();
ec481c6c 1235 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1236 memory_region_unref(mspr->rb->mr);
ec481c6c 1237 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1238 g_free(mspr);
1239 }
1240 rcu_read_unlock();
1241}
1242
1243/**
3d0684b2
JQ
1244 * ram_save_queue_pages: queue the page for transmission
1245 *
1246 * A request from postcopy destination for example.
1247 *
1248 * Returns zero on success or negative on error
1249 *
3d0684b2
JQ
1250 * @rbname: Name of the RAMBlock of the request. NULL means the
1251 * same as the last one.
1252 * @start: starting address from the start of the RAMBlock
1253 * @len: length (in bytes) to send
6c595cde 1254 */
96506894 1255int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1256{
1257 RAMBlock *ramblock;
68a098f3 1258 RAMState *rs = &ram_state;
6c595cde 1259
96506894 1260 rs->postcopy_requests++;
6c595cde
DDAG
1261 rcu_read_lock();
1262 if (!rbname) {
1263 /* Reuse last RAMBlock */
68a098f3 1264 ramblock = rs->last_req_rb;
6c595cde
DDAG
1265
1266 if (!ramblock) {
1267 /*
1268 * Shouldn't happen, we can't reuse the last RAMBlock if
1269 * it's the 1st request.
1270 */
1271 error_report("ram_save_queue_pages no previous block");
1272 goto err;
1273 }
1274 } else {
1275 ramblock = qemu_ram_block_by_name(rbname);
1276
1277 if (!ramblock) {
1278 /* We shouldn't be asked for a non-existent RAMBlock */
1279 error_report("ram_save_queue_pages no block '%s'", rbname);
1280 goto err;
1281 }
68a098f3 1282 rs->last_req_rb = ramblock;
6c595cde
DDAG
1283 }
1284 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1285 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1286 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1287 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1288 __func__, start, len, ramblock->used_length);
1289 goto err;
1290 }
1291
ec481c6c
JQ
1292 struct RAMSrcPageRequest *new_entry =
1293 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1294 new_entry->rb = ramblock;
1295 new_entry->offset = start;
1296 new_entry->len = len;
1297
1298 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1299 qemu_mutex_lock(&rs->src_page_req_mutex);
1300 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1301 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1302 rcu_read_unlock();
1303
1304 return 0;
1305
1306err:
1307 rcu_read_unlock();
1308 return -1;
1309}
1310
a82d593b 1311/**
3d0684b2 1312 * ram_save_target_page: save one target page
a82d593b 1313 *
3d0684b2 1314 * Returns the number of pages written
a82d593b 1315 *
6f37bb8b 1316 * @rs: current RAM state
3d0684b2 1317 * @ms: current migration state
a82d593b 1318 * @f: QEMUFile where to send the data
3d0684b2 1319 * @pss: data about the page we want to send
a82d593b 1320 * @last_stage: if we are at the completion stage
3d0684b2 1321 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
a82d593b 1322 */
6f37bb8b 1323static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890 1324 PageSearchStatus *pss,
a82d593b 1325 bool last_stage,
a82d593b
DDAG
1326 ram_addr_t dirty_ram_abs)
1327{
1328 int res = 0;
1329
1330 /* Check whether the page is dirty and, if it is, send it */
0d8ec885 1331 if (migration_bitmap_clear_dirty(rs, dirty_ram_abs)) {
a82d593b
DDAG
1332 unsigned long *unsentmap;
1333 if (compression_switch && migrate_use_compression()) {
072c2511 1334 res = ram_save_compressed_page(rs, ms, f, pss, last_stage);
a82d593b 1335 } else {
072c2511 1336 res = ram_save_page(rs, ms, f, pss, last_stage);
a82d593b
DDAG
1337 }
1338
1339 if (res < 0) {
1340 return res;
1341 }
eb859c53 1342 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
a82d593b
DDAG
1343 if (unsentmap) {
1344 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1345 }
3fd3c4b3
DDAG
1346 /* Only update last_sent_block if a block was actually sent; xbzrle
1347 * might have decided the page was identical so didn't bother writing
1348 * to the stream.
1349 */
1350 if (res > 0) {
6f37bb8b 1351 rs->last_sent_block = pss->block;
3fd3c4b3 1352 }
a82d593b
DDAG
1353 }
1354
1355 return res;
1356}
1357
1358/**
3d0684b2 1359 * ram_save_host_page: save a whole host page
a82d593b 1360 *
3d0684b2
JQ
1361 * Starting at *offset send pages up to the end of the current host
1362 * page. It's valid for the initial offset to point into the middle of
1363 * a host page in which case the remainder of the hostpage is sent.
1364 * Only dirty target pages are sent. Note that the host page size may
1365 * be a huge page for this block.
a82d593b 1366 *
3d0684b2
JQ
1367 * Returns the number of pages written or negative on error
1368 *
6f37bb8b 1369 * @rs: current RAM state
3d0684b2 1370 * @ms: current migration state
a82d593b 1371 * @f: QEMUFile where to send the data
3d0684b2 1372 * @pss: data about the page we want to send
a82d593b 1373 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1374 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1375 */
6f37bb8b 1376static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
a08f6890
HZ
1377 PageSearchStatus *pss,
1378 bool last_stage,
a82d593b
DDAG
1379 ram_addr_t dirty_ram_abs)
1380{
1381 int tmppages, pages = 0;
4c011c37
DDAG
1382 size_t pagesize = qemu_ram_pagesize(pss->block);
1383
a82d593b 1384 do {
6f37bb8b 1385 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
072c2511 1386 dirty_ram_abs);
a82d593b
DDAG
1387 if (tmppages < 0) {
1388 return tmppages;
1389 }
1390
1391 pages += tmppages;
a08f6890 1392 pss->offset += TARGET_PAGE_SIZE;
a82d593b 1393 dirty_ram_abs += TARGET_PAGE_SIZE;
4c011c37 1394 } while (pss->offset & (pagesize - 1));
a82d593b
DDAG
1395
1396 /* The offset we leave with is the last one we looked at */
a08f6890 1397 pss->offset -= TARGET_PAGE_SIZE;
a82d593b
DDAG
1398 return pages;
1399}
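/*
 * Example (illustrative): for a RAMBlock backed by 2 MiB huge pages with a
 * 4 KiB target page size, qemu_ram_pagesize() returns 2 MiB, so the loop
 * above walks up to 512 target pages before "pss->offset & (pagesize - 1)"
 * wraps to zero.
 */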
6c595cde 1400
56e93d26 1401/**
3d0684b2 1402 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1403 *
1404 * Called within an RCU critical section.
1405 *
3d0684b2 1406 * Returns the number of pages written where zero means no dirty pages
56e93d26 1407 *
6f37bb8b 1408 * @rs: current RAM state
56e93d26
JQ
1409 * @f: QEMUFile where to send the data
1410 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1411 *
1412 * On systems where host-page-size > target-page-size it will send all the
1413 * pages in a host page that are dirty.
56e93d26
JQ
1414 */
1415
072c2511 1416static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage)
56e93d26 1417{
b8fb8cb7 1418 PageSearchStatus pss;
a82d593b 1419 MigrationState *ms = migrate_get_current();
56e93d26 1420 int pages = 0;
b9e60928 1421 bool again, found;
f3f491fc
DDAG
1422 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1423 ram_addr_t space */
56e93d26 1424
0827b9e9
AA
1425 /* No dirty page as there is zero RAM */
1426 if (!ram_bytes_total()) {
1427 return pages;
1428 }
1429
6f37bb8b
JQ
1430 pss.block = rs->last_seen_block;
1431 pss.offset = rs->last_offset;
b8fb8cb7
DDAG
1432 pss.complete_round = false;
1433
1434 if (!pss.block) {
1435 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1436 }
56e93d26 1437
b9e60928 1438 do {
a82d593b 1439 again = true;
ec481c6c 1440 found = get_queued_page(rs, &pss, &dirty_ram_abs);
b9e60928 1441
a82d593b
DDAG
1442 if (!found) {
1443 /* priority queue empty, so just search for something dirty */
6f37bb8b 1444 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
a82d593b 1445 }
f3f491fc 1446
a82d593b 1447 if (found) {
072c2511 1448 pages = ram_save_host_page(rs, ms, f, &pss, last_stage,
a82d593b 1449 dirty_ram_abs);
56e93d26 1450 }
b9e60928 1451 } while (!pages && again);
56e93d26 1452
6f37bb8b
JQ
1453 rs->last_seen_block = pss.block;
1454 rs->last_offset = pss.offset;
56e93d26
JQ
1455
1456 return pages;
1457}
1458
1459void acct_update_position(QEMUFile *f, size_t size, bool zero)
1460{
1461 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b
JQ
1462 RAMState *rs = &ram_state;
1463
56e93d26 1464 if (zero) {
f7ccd61b 1465 rs->zero_pages += pages;
56e93d26 1466 } else {
b4d1c6e7 1467 rs->norm_pages += pages;
2f4fde93 1468 rs->bytes_transferred += size;
56e93d26
JQ
1469 qemu_update_position(f, size);
1470 }
1471}
1472
56e93d26
JQ
1473uint64_t ram_bytes_total(void)
1474{
1475 RAMBlock *block;
1476 uint64_t total = 0;
1477
1478 rcu_read_lock();
1479 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1480 total += block->used_length;
1481 rcu_read_unlock();
1482 return total;
1483}
1484
1485void free_xbzrle_decoded_buf(void)
1486{
1487 g_free(xbzrle_decoded_buf);
1488 xbzrle_decoded_buf = NULL;
1489}
1490
eb859c53 1491static void migration_bitmap_free(struct RAMBitmap *bmap)
60be6340
DL
1492{
1493 g_free(bmap->bmap);
f3f491fc 1494 g_free(bmap->unsentmap);
60be6340
DL
1495 g_free(bmap);
1496}
1497
6ad2a215 1498static void ram_migration_cleanup(void *opaque)
56e93d26 1499{
eb859c53
JQ
1500 RAMState *rs = opaque;
1501
2ff64038
LZ
1502 /* the caller must hold the iothread lock or be in a bh, so there is
1503 * no write race against this migration_bitmap
1504 */
eb859c53
JQ
1505 struct RAMBitmap *bitmap = rs->ram_bitmap;
1506 atomic_rcu_set(&rs->ram_bitmap, NULL);
2ff64038 1507 if (bitmap) {
56e93d26 1508 memory_global_dirty_log_stop();
60be6340 1509 call_rcu(bitmap, migration_bitmap_free, rcu);
56e93d26
JQ
1510 }
1511
1512 XBZRLE_cache_lock();
1513 if (XBZRLE.cache) {
1514 cache_fini(XBZRLE.cache);
1515 g_free(XBZRLE.encoded_buf);
1516 g_free(XBZRLE.current_buf);
adb65dec 1517 g_free(ZERO_TARGET_PAGE);
56e93d26
JQ
1518 XBZRLE.cache = NULL;
1519 XBZRLE.encoded_buf = NULL;
1520 XBZRLE.current_buf = NULL;
1521 }
1522 XBZRLE_cache_unlock();
1523}
1524
6f37bb8b 1525static void ram_state_reset(RAMState *rs)
56e93d26 1526{
6f37bb8b
JQ
1527 rs->last_seen_block = NULL;
1528 rs->last_sent_block = NULL;
1529 rs->last_offset = 0;
1530 rs->last_version = ram_list.version;
1531 rs->ram_bulk_stage = true;
56e93d26
JQ
1532}
1533
1534#define MAX_WAIT 50 /* ms, half buffered_file limit */
1535
dd631697
LZ
1536void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1537{
0d8ec885 1538 RAMState *rs = &ram_state;
108cfae0 1539
dd631697
LZ
1540 /* called in qemu main thread, so there is
1541 * no writing race against this migration_bitmap
1542 */
eb859c53
JQ
1543 if (rs->ram_bitmap) {
1544 struct RAMBitmap *old_bitmap = rs->ram_bitmap, *bitmap;
1545 bitmap = g_new(struct RAMBitmap, 1);
60be6340 1546 bitmap->bmap = bitmap_new(new);
dd631697
LZ
1547
1548 /* prevent migration_bitmap content from being set bit
1549 * by migration_bitmap_sync_range() at the same time.
1550 * it is safe to migration if migration_bitmap is cleared bit
1551 * at the same time.
1552 */
108cfae0 1553 qemu_mutex_lock(&rs->bitmap_mutex);
60be6340
DL
1554 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1555 bitmap_set(bitmap->bmap, old, new - old);
f3f491fc
DDAG
1556
1557 /* We don't have a way to safely extend the sentmap
1558 * with RCU; so mark it as missing, entry to postcopy
1559 * will fail.
1560 */
1561 bitmap->unsentmap = NULL;
1562
eb859c53 1563 atomic_rcu_set(&rs->ram_bitmap, bitmap);
108cfae0 1564 qemu_mutex_unlock(&rs->bitmap_mutex);
0d8ec885 1565 rs->migration_dirty_pages += new - old;
60be6340 1566 call_rcu(old_bitmap, migration_bitmap_free, rcu);
dd631697
LZ
1567 }
1568}
56e93d26 1569
4f2e4252
DDAG
1570/*
1571 * 'expected' is the value you expect the bitmap mostly to be full
1572 * of; it won't bother printing lines that are all this value.
1573 * If 'todump' is null the migration bitmap is dumped.
1574 */
1575void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1576{
1577 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53 1578 RAMState *rs = &ram_state;
4f2e4252
DDAG
1579 int64_t cur;
1580 int64_t linelen = 128;
1581 char linebuf[129];
1582
1583 if (!todump) {
eb859c53 1584 todump = atomic_rcu_read(&rs->ram_bitmap)->bmap;
4f2e4252
DDAG
1585 }
1586
1587 for (cur = 0; cur < ram_pages; cur += linelen) {
1588 int64_t curb;
1589 bool found = false;
1590 /*
1591 * Last line; catch the case where the line length
1592 * is longer than remaining ram
1593 */
1594 if (cur + linelen > ram_pages) {
1595 linelen = ram_pages - cur;
1596 }
1597 for (curb = 0; curb < linelen; curb++) {
1598 bool thisbit = test_bit(cur + curb, todump);
1599 linebuf[curb] = thisbit ? '1' : '.';
1600 found = found || (thisbit != expected);
1601 }
1602 if (found) {
1603 linebuf[curb] = '\0';
1604 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1605 }
1606 }
1607}
1608
e0b266f0
DDAG
1609/* **** functions for postcopy ***** */
1610
ced1c616
PB
1611void ram_postcopy_migrated_memory_release(MigrationState *ms)
1612{
eb859c53 1613 RAMState *rs = &ram_state;
ced1c616 1614 struct RAMBlock *block;
eb859c53 1615 unsigned long *bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
ced1c616
PB
1616
1617 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1618 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1619 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1620 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1621
1622 while (run_start < range) {
1623 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1624 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1625 (run_end - run_start) << TARGET_PAGE_BITS);
1626 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1627 }
1628 }
1629}
1630
3d0684b2
JQ
1631/**
1632 * postcopy_send_discard_bm_ram: discard a RAMBlock
1633 *
1634 * Returns zero on success
1635 *
e0b266f0
DDAG
1636 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1637 * Note: At this point the 'unsentmap' is the processed bitmap combined
1638 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1639 *
1640 * @ms: current migration state
1641 * @pds: state for postcopy
1642 * @start: RAMBlock starting page
1643 * @length: RAMBlock size
e0b266f0
DDAG
1644 */
1645static int postcopy_send_discard_bm_ram(MigrationState *ms,
1646 PostcopyDiscardState *pds,
1647 unsigned long start,
1648 unsigned long length)
1649{
eb859c53 1650 RAMState *rs = &ram_state;
e0b266f0
DDAG
1651 unsigned long end = start + length; /* one after the end */
1652 unsigned long current;
1653 unsigned long *unsentmap;
1654
eb859c53 1655 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1656 for (current = start; current < end; ) {
1657 unsigned long one = find_next_bit(unsentmap, end, current);
1658
1659 if (one <= end) {
1660 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1661 unsigned long discard_length;
1662
1663 if (zero >= end) {
1664 discard_length = end - one;
1665 } else {
1666 discard_length = zero - one;
1667 }
d688c62d
DDAG
1668 if (discard_length) {
1669 postcopy_discard_send_range(ms, pds, one, discard_length);
1670 }
e0b266f0
DDAG
1671 current = one + discard_length;
1672 } else {
1673 current = one;
1674 }
1675 }
1676
1677 return 0;
1678}
1679
3d0684b2
JQ
1680/**
1681 * postcopy_each_ram_send_discard: discard all RAMBlocks
1682 *
1683 * Returns 0 for success or negative for error
1684 *
e0b266f0
DDAG
1685 * Utility for the outgoing postcopy code.
1686 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1687 * passing it bitmap indexes and name.
e0b266f0
DDAG
1688 * (qemu_ram_foreach_block ends up passing unscaled lengths
1689 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1690 *
1691 * @ms: current migration state
e0b266f0
DDAG
1692 */
1693static int postcopy_each_ram_send_discard(MigrationState *ms)
1694{
1695 struct RAMBlock *block;
1696 int ret;
1697
1698 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1699 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1700 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1701 first,
1702 block->idstr);
1703
1704 /*
1705 * Postcopy sends chunks of bitmap over the wire, but it
1706 * just needs page indexes at this point, which avoids having
1707 * target-page-specific code here.
1708 */
1709 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1710 block->used_length >> TARGET_PAGE_BITS);
1711 postcopy_discard_send_finish(ms, pds);
1712 if (ret) {
1713 return ret;
1714 }
1715 }
1716
1717 return 0;
1718}
1719
3d0684b2
JQ
1720/**
1721 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1722 *
1723 * Helper for postcopy_chunk_hostpages; it's called twice to
1724 * canonicalize the two bitmaps, which are similar but one is
1725 * inverted.
99e314eb 1726 *
3d0684b2
JQ
1727 * Postcopy requires that all target pages in a hostpage are dirty or
1728 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1729 *
3d0684b2
JQ
1730 * @ms: current migration state
1731 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1732 * otherwise we need to canonicalize partially dirty host pages
1733 * @block: block that contains the page we want to canonicalize
1734 * @pds: state for postcopy
99e314eb
DDAG
1735 */
1736static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1737 RAMBlock *block,
1738 PostcopyDiscardState *pds)
1739{
0d8ec885 1740 RAMState *rs = &ram_state;
99e314eb
DDAG
1741 unsigned long *bitmap;
1742 unsigned long *unsentmap;
29c59172 1743 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
99e314eb
DDAG
1744 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1745 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1746 unsigned long last = first + (len - 1);
1747 unsigned long run_start;
1748
29c59172
DDAG
1749 if (block->page_size == TARGET_PAGE_SIZE) {
1750 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1751 return;
1752 }
1753
eb859c53
JQ
1754 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
1755 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
99e314eb
DDAG
1756
1757 if (unsent_pass) {
1758 /* Find a sent page */
1759 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1760 } else {
1761 /* Find a dirty page */
1762 run_start = find_next_bit(bitmap, last + 1, first);
1763 }
1764
1765 while (run_start <= last) {
1766 bool do_fixup = false;
1767 unsigned long fixup_start_addr;
1768 unsigned long host_offset;
1769
1770 /*
1771 * If the start of this run of pages is in the middle of a host
1772 * page, then we need to fixup this host page.
1773 */
1774 host_offset = run_start % host_ratio;
1775 if (host_offset) {
1776 do_fixup = true;
1777 run_start -= host_offset;
1778 fixup_start_addr = run_start;
1779 /* For the next pass */
1780 run_start = run_start + host_ratio;
1781 } else {
1782 /* Find the end of this run */
1783 unsigned long run_end;
1784 if (unsent_pass) {
1785 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1786 } else {
1787 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1788 }
1789 /*
1790 * If the end isn't at the start of a host page, then the
1791 * run doesn't finish at the end of a host page
1792 * and we need to discard.
1793 */
1794 host_offset = run_end % host_ratio;
1795 if (host_offset) {
1796 do_fixup = true;
1797 fixup_start_addr = run_end - host_offset;
1798 /*
1799 * This host page has gone, the next loop iteration starts
1800 * from after the fixup
1801 */
1802 run_start = fixup_start_addr + host_ratio;
1803 } else {
1804 /*
1805 * No discards on this iteration, next loop starts from
1806 * next sent/dirty page
1807 */
1808 run_start = run_end + 1;
1809 }
1810 }
1811
1812 if (do_fixup) {
1813 unsigned long page;
1814
1815 /* Tell the destination to discard this page */
1816 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1817 /* For the unsent_pass we:
1818 * discard partially sent pages
1819 * For the !unsent_pass (dirty) we:
1820 * discard partially dirty pages that were sent
1821 * (any partially sent pages were already discarded
1822 * by the previous unsent_pass)
1823 */
1824 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1825 host_ratio);
1826 }
1827
1828 /* Clean up the bitmap */
1829 for (page = fixup_start_addr;
1830 page < fixup_start_addr + host_ratio; page++) {
1831 /* All pages in this host page are now not sent */
1832 set_bit(page, unsentmap);
1833
1834 /*
1835 * Remark them as dirty, updating the count for any pages
1836 * that weren't previously dirty.
1837 */
0d8ec885 1838 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1839 }
1840 }
1841
1842 if (unsent_pass) {
1843 /* Find the next sent page for the next iteration */
1844 run_start = find_next_zero_bit(unsentmap, last + 1,
1845 run_start);
1846 } else {
1847 /* Find the next dirty page for the next iteration */
1848 run_start = find_next_bit(bitmap, last + 1, run_start);
1849 }
1850 }
1851}
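/*
 * Worked example of the dirty pass above, assuming host_ratio == 4
 * (e.g. a 16K host page made of four 4K target pages): a dirty run
 * starting at target page 6 is pulled back to the host-page boundary at
 * page 4; if page 4 had already been sent (clear in unsentmap), pages
 * 4..7 are reported as a discard range, and in either case pages 4..7
 * are marked unsent and dirty again before the scan resumes at page 8.
 */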
1852
3d0684b2
JQ
1853/**
1854 * postcopy_chunk_hostpages: discard any partially sent host page
1855 *
99e314eb
DDAG
1856 * Utility for the outgoing postcopy code.
1857 *
1858 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1859 * dirty host-page size chunks as all dirty. In this case the host-page
1860 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1861 *
3d0684b2
JQ
1862 * Returns zero on success
1863 *
1864 * @ms: current migration state
99e314eb
DDAG
1865 */
1866static int postcopy_chunk_hostpages(MigrationState *ms)
1867{
6f37bb8b 1868 RAMState *rs = &ram_state;
99e314eb
DDAG
1869 struct RAMBlock *block;
1870
99e314eb 1871 /* Easiest way to make sure we don't resume in the middle of a host-page */
6f37bb8b
JQ
1872 rs->last_seen_block = NULL;
1873 rs->last_sent_block = NULL;
1874 rs->last_offset = 0;
99e314eb
DDAG
1875
1876 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1877 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1878
1879 PostcopyDiscardState *pds =
1880 postcopy_discard_send_init(ms, first, block->idstr);
1881
1882 /* First pass: Discard all partially sent host pages */
1883 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1884 /*
1885 * Second pass: Ensure that all partially dirty host pages are made
1886 * fully dirty.
1887 */
1888 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1889
1890 postcopy_discard_send_finish(ms, pds);
1891 } /* ram_list loop */
1892
1893 return 0;
1894}
1895
3d0684b2
JQ
1896/**
1897 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1898 *
1899 * Returns zero on success
1900 *
e0b266f0
DDAG
1901 * Transmit the set of pages to be discarded after precopy to the target
1902 * these are pages that:
1903 * a) Have been previously transmitted but are now dirty again
1904 * b) Pages that have never been transmitted, this ensures that
1905 * any pages on the destination that have been mapped by background
1906 * tasks get discarded (transparent huge pages are the specific concern)
1907 * Hopefully this is pretty sparse
3d0684b2
JQ
1908 *
1909 * @ms: current migration state
e0b266f0
DDAG
1910 */
1911int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1912{
eb859c53 1913 RAMState *rs = &ram_state;
e0b266f0
DDAG
1914 int ret;
1915 unsigned long *bitmap, *unsentmap;
1916
1917 rcu_read_lock();
1918
1919 /* This should be our last sync, the src is now paused */
eb859c53 1920 migration_bitmap_sync(rs);
e0b266f0 1921
eb859c53 1922 unsentmap = atomic_rcu_read(&rs->ram_bitmap)->unsentmap;
e0b266f0
DDAG
1923 if (!unsentmap) {
1924 /* We don't have a safe way to resize the unsentmap, so
1925 * if the bitmap was resized it will be NULL at this
1926 * point.
1927 */
1928 error_report("migration ram resized during precopy phase");
1929 rcu_read_unlock();
1930 return -EINVAL;
1931 }
1932
29c59172 1933 /* Deal with TPS != HPS and huge pages */
99e314eb
DDAG
1934 ret = postcopy_chunk_hostpages(ms);
1935 if (ret) {
1936 rcu_read_unlock();
1937 return ret;
1938 }
1939
e0b266f0
DDAG
1940 /*
1941 * Update the unsentmap to be unsentmap = unsentmap | dirty
1942 */
eb859c53 1943 bitmap = atomic_rcu_read(&rs->ram_bitmap)->bmap;
e0b266f0
DDAG
1944 bitmap_or(unsentmap, unsentmap, bitmap,
1945 last_ram_offset() >> TARGET_PAGE_BITS);
1946
1947
1948 trace_ram_postcopy_send_discard_bitmap();
1949#ifdef DEBUG_POSTCOPY
1950 ram_debug_dump_bitmap(unsentmap, true);
1951#endif
1952
1953 ret = postcopy_each_ram_send_discard(ms);
1954 rcu_read_unlock();
1955
1956 return ret;
1957}
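/*
 * Note the ordering above: the final bitmap sync happens with the source
 * paused, then host pages are canonicalized, then the dirty bitmap is
 * OR-ed into the unsentmap, and only then are the per-RAMBlock discard
 * messages sent; every page discarded here has to be delivered again
 * during the postcopy phase.
 */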
1958
3d0684b2
JQ
1959/**
1960 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1961 *
3d0684b2 1962 * Returns zero on success
e0b266f0 1963 *
3d0684b2 1964 * @mis: current migration incoming state
36449157
JQ
1965 * @rbname: name of the RAMBlock of the request. NULL means the
1966 * same as the last one.
3d0684b2
JQ
1967 * @start: byte offset of the range to discard within the RAMBlock
1968 * @length: length in bytes of the range to discard
e0b266f0
DDAG
1969 */
1970int ram_discard_range(MigrationIncomingState *mis,
36449157 1971 const char *rbname,
e0b266f0
DDAG
1972 uint64_t start, size_t length)
1973{
1974 int ret = -1;
1975
36449157 1976 trace_ram_discard_range(rbname, start, length);
d3a5038c 1977
e0b266f0 1978 rcu_read_lock();
36449157 1979 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
1980
1981 if (!rb) {
36449157 1982 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
1983 goto err;
1984 }
1985
d3a5038c 1986 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
1987
1988err:
1989 rcu_read_unlock();
1990
1991 return ret;
1992}
1993
ceb4d168 1994static int ram_state_init(RAMState *rs)
56e93d26 1995{
56e93d26
JQ
1996 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1997
ceb4d168 1998 memset(rs, 0, sizeof(*rs));
108cfae0 1999 qemu_mutex_init(&rs->bitmap_mutex);
ec481c6c
JQ
2000 qemu_mutex_init(&rs->src_page_req_mutex);
2001 QSIMPLEQ_INIT(&rs->src_page_requests);
56e93d26
JQ
2002
2003 if (migrate_use_xbzrle()) {
2004 XBZRLE_cache_lock();
adb65dec 2005 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
56e93d26
JQ
2006 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2007 TARGET_PAGE_SIZE,
2008 TARGET_PAGE_SIZE);
2009 if (!XBZRLE.cache) {
2010 XBZRLE_cache_unlock();
2011 error_report("Error creating cache");
2012 return -1;
2013 }
2014 XBZRLE_cache_unlock();
2015
2016 /* We prefer not to abort if there is no memory */
2017 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2018 if (!XBZRLE.encoded_buf) {
2019 error_report("Error allocating encoded_buf");
2020 return -1;
2021 }
2022
2023 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2024 if (!XBZRLE.current_buf) {
2025 error_report("Error allocating current_buf");
2026 g_free(XBZRLE.encoded_buf);
2027 XBZRLE.encoded_buf = NULL;
2028 return -1;
2029 }
56e93d26
JQ
2030 }
2031
49877834
PB
2032 /* For memory_global_dirty_log_start below. */
2033 qemu_mutex_lock_iothread();
2034
56e93d26
JQ
2035 qemu_mutex_lock_ramlist();
2036 rcu_read_lock();
6f37bb8b 2037 ram_state_reset(rs);
56e93d26 2038
eb859c53 2039 rs->ram_bitmap = g_new0(struct RAMBitmap, 1);
0827b9e9
AA
2040 /* Skip setting bitmap if there is no RAM */
2041 if (ram_bytes_total()) {
2042 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
eb859c53
JQ
2043 rs->ram_bitmap->bmap = bitmap_new(ram_bitmap_pages);
2044 bitmap_set(rs->ram_bitmap->bmap, 0, ram_bitmap_pages);
0827b9e9
AA
2045
2046 if (migrate_postcopy_ram()) {
eb859c53
JQ
2047 rs->ram_bitmap->unsentmap = bitmap_new(ram_bitmap_pages);
2048 bitmap_set(rs->ram_bitmap->unsentmap, 0, ram_bitmap_pages);
0827b9e9 2049 }
f3f491fc
DDAG
2050 }
2051
56e93d26
JQ
2052 /*
2053 * Count the total number of pages used by ram blocks not including any
2054 * gaps due to alignment or unplugs.
2055 */
0d8ec885 2056 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
56e93d26
JQ
2057
2058 memory_global_dirty_log_start();
8d820d6f 2059 migration_bitmap_sync(rs);
56e93d26 2060 qemu_mutex_unlock_ramlist();
49877834 2061 qemu_mutex_unlock_iothread();
a91246c9
HZ
2062 rcu_read_unlock();
2063
2064 return 0;
2065}
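/*
 * Initial state established above: every page is set in ram_bitmap->bmap
 * (and in ->unsentmap when postcopy is enabled) and migration_dirty_pages
 * covers all of RAM, so the first pass of the migration sends every page
 * at least once; the bitmap sync then adds anything the guest dirties on
 * top of that.
 */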
2066
3d0684b2
JQ
2067/*
2068 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2069 * a long-running RCU critical section. When RCU reclaims in the code
2070 * start to become numerous it will be necessary to reduce the
2071 * granularity of these critical sections.
2072 */
2073
3d0684b2
JQ
2074/**
2075 * ram_save_setup: Setup RAM for migration
2076 *
2077 * Returns zero to indicate success and negative for error
2078 *
2079 * @f: QEMUFile where to send the data
2080 * @opaque: RAMState pointer
2081 */
a91246c9
HZ
2082static int ram_save_setup(QEMUFile *f, void *opaque)
2083{
6f37bb8b 2084 RAMState *rs = opaque;
a91246c9
HZ
2085 RAMBlock *block;
2086
2087 /* migration has already setup the bitmap, reuse it. */
2088 if (!migration_in_colo_state()) {
ceb4d168 2089 if (ram_state_init(rs) < 0) {
a91246c9
HZ
2090 return -1;
2091 }
2092 }
2093
2094 rcu_read_lock();
56e93d26
JQ
2095
2096 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2097
2098 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2099 qemu_put_byte(f, strlen(block->idstr));
2100 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2101 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2102 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2103 qemu_put_be64(f, block->page_size);
2104 }
56e93d26
JQ
2105 }
2106
2107 rcu_read_unlock();
2108
2109 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2110 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2111
2112 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2113
2114 return 0;
2115}
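/*
 * Layout of the setup section written above, in stream order:
 *   be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   per RAMBlock: u8 idstr length, idstr bytes, be64 used_length and,
 *     only for postcopy when the block's page size differs from the
 *     host page size, be64 page_size
 *   be64: RAM_SAVE_FLAG_EOS
 * ram_load() consumes this under the RAM_SAVE_FLAG_MEM_SIZE case.
 */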
2116
3d0684b2
JQ
2117/**
2118 * ram_save_iterate: iterative stage for migration
2119 *
2120 * Returns zero to indicate success and negative for error
2121 *
2122 * @f: QEMUFile where to send the data
2123 * @opaque: RAMState pointer
2124 */
56e93d26
JQ
2125static int ram_save_iterate(QEMUFile *f, void *opaque)
2126{
6f37bb8b 2127 RAMState *rs = opaque;
56e93d26
JQ
2128 int ret;
2129 int i;
2130 int64_t t0;
5c90308f 2131 int done = 0;
56e93d26
JQ
2132
2133 rcu_read_lock();
6f37bb8b
JQ
2134 if (ram_list.version != rs->last_version) {
2135 ram_state_reset(rs);
56e93d26
JQ
2136 }
2137
2138 /* Read version before ram_list.blocks */
2139 smp_rmb();
2140
2141 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2142
2143 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2144 i = 0;
2145 while ((ret = qemu_file_rate_limit(f)) == 0) {
2146 int pages;
2147
072c2511 2148 pages = ram_find_and_save_block(rs, f, false);
56e93d26
JQ
2149 /* no more pages to send */
2150 if (pages == 0) {
5c90308f 2151 done = 1;
56e93d26
JQ
2152 break;
2153 }
23b28c3c 2154 rs->iterations++;
070afca2 2155
56e93d26
JQ
2156 /* we want to check in the 1st loop, just in case it was the 1st time
2157 and we had to sync the dirty bitmap.
2158 qemu_get_clock_ns() is a bit expensive, so we only check it every
2159 few iterations
2160 */
2161 if ((i & 63) == 0) {
2162 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2163 if (t1 > MAX_WAIT) {
55c4446b 2164 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2165 break;
2166 }
2167 }
2168 i++;
2169 }
2f4fde93 2170 flush_compressed_data(rs, f);
56e93d26
JQ
2171 rcu_read_unlock();
2172
2173 /*
2174 * Must occur before EOS (or any QEMUFile operation)
2175 * because of RDMA protocol.
2176 */
2177 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2178
2179 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2f4fde93 2180 rs->bytes_transferred += 8;
56e93d26
JQ
2181
2182 ret = qemu_file_get_error(f);
2183 if (ret < 0) {
2184 return ret;
2185 }
2186
5c90308f 2187 return done;
56e93d26
JQ
2188}
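/*
 * The send loop above exits for one of three reasons: the rate limiter
 * in qemu_file_rate_limit() told us to stop, there were no dirty pages
 * left to send (done == 1), or more than MAX_WAIT milliseconds elapsed
 * in this iteration (only checked every 64 loop iterations because
 * reading the clock is relatively expensive).
 */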
2189
3d0684b2
JQ
2190/**
2191 * ram_save_complete: function called to send the remaining amount of ram
2192 *
2193 * Returns zero to indicate success
2194 *
2195 * Called with iothread lock
2196 *
2197 * @f: QEMUFile where to send the data
2198 * @opaque: RAMState pointer
2199 */
56e93d26
JQ
2200static int ram_save_complete(QEMUFile *f, void *opaque)
2201{
6f37bb8b
JQ
2202 RAMState *rs = opaque;
2203
56e93d26
JQ
2204 rcu_read_lock();
2205
663e6c1d 2206 if (!migration_in_postcopy(migrate_get_current())) {
8d820d6f 2207 migration_bitmap_sync(rs);
663e6c1d 2208 }
56e93d26
JQ
2209
2210 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2211
2212 /* try transferring iterative blocks of memory */
2213
2214 /* flush all remaining blocks regardless of rate limiting */
2215 while (true) {
2216 int pages;
2217
072c2511 2218 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state());
56e93d26
JQ
2219 /* no more blocks to send */
2220 if (pages == 0) {
2221 break;
2222 }
2223 }
2224
2f4fde93 2225 flush_compressed_data(rs, f);
56e93d26 2226 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2227
2228 rcu_read_unlock();
d09a6fde 2229
56e93d26
JQ
2230 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2231
2232 return 0;
2233}
2234
c31b098f
DDAG
2235static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2236 uint64_t *non_postcopiable_pending,
2237 uint64_t *postcopiable_pending)
56e93d26 2238{
8d820d6f 2239 RAMState *rs = opaque;
56e93d26
JQ
2240 uint64_t remaining_size;
2241
9edabd4d 2242 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2243
663e6c1d
DDAG
2244 if (!migration_in_postcopy(migrate_get_current()) &&
2245 remaining_size < max_size) {
56e93d26
JQ
2246 qemu_mutex_lock_iothread();
2247 rcu_read_lock();
8d820d6f 2248 migration_bitmap_sync(rs);
56e93d26
JQ
2249 rcu_read_unlock();
2250 qemu_mutex_unlock_iothread();
9edabd4d 2251 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2252 }
c31b098f
DDAG
2253
2254 /* We can do postcopy, and all the data is postcopiable */
2255 *postcopiable_pending += remaining_size;
56e93d26
JQ
2256}
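/*
 * The estimate above is dirty pages * target page size; when it already
 * fits under max_size a fresh bitmap sync is taken to refine it before
 * the caller decides whether to complete. Everything is accounted as
 * postcopiable because all RAM pages can still be sent after the switch
 * to postcopy.
 */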
2257
2258static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2259{
2260 unsigned int xh_len;
2261 int xh_flags;
063e760a 2262 uint8_t *loaded_data;
56e93d26
JQ
2263
2264 if (!xbzrle_decoded_buf) {
2265 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2266 }
063e760a 2267 loaded_data = xbzrle_decoded_buf;
56e93d26
JQ
2268
2269 /* extract RLE header */
2270 xh_flags = qemu_get_byte(f);
2271 xh_len = qemu_get_be16(f);
2272
2273 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2274 error_report("Failed to load XBZRLE page - wrong compression!");
2275 return -1;
2276 }
2277
2278 if (xh_len > TARGET_PAGE_SIZE) {
2279 error_report("Failed to load XBZRLE page - len overflow!");
2280 return -1;
2281 }
2282 /* load data and decode */
063e760a 2283 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2284
2285 /* decode RLE */
063e760a 2286 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2287 TARGET_PAGE_SIZE) == -1) {
2288 error_report("Failed to load XBZRLE page - decode error!");
2289 return -1;
2290 }
2291
2292 return 0;
2293}
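/*
 * XBZRLE page format consumed above: one byte of flags, which must be
 * ENCODING_FLAG_XBZRLE, a be16 length of the encoded data (at most
 * TARGET_PAGE_SIZE), then the encoded delta itself, which
 * xbzrle_decode_buffer() applies on top of the existing contents of
 * 'host'.
 */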
2294
3d0684b2
JQ
2295/**
2296 * ram_block_from_stream: read a RAMBlock id from the migration stream
2297 *
2298 * Must be called from within a rcu critical section.
2299 *
56e93d26 2300 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2301 *
3d0684b2
JQ
2302 * @f: QEMUFile where to read the data from
2303 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2304 */
3d0684b2 2305static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2306{
2307 static RAMBlock *block = NULL;
2308 char id[256];
2309 uint8_t len;
2310
2311 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2312 if (!block) {
56e93d26
JQ
2313 error_report("Ack, bad migration stream!");
2314 return NULL;
2315 }
4c4bad48 2316 return block;
56e93d26
JQ
2317 }
2318
2319 len = qemu_get_byte(f);
2320 qemu_get_buffer(f, (uint8_t *)id, len);
2321 id[len] = 0;
2322
e3dd7493 2323 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2324 if (!block) {
2325 error_report("Can't find block %s", id);
2326 return NULL;
56e93d26
JQ
2327 }
2328
4c4bad48
HZ
2329 return block;
2330}
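/*
 * RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as the previous page", so
 * the sender only transmits the idstr for the first page of a run from
 * a given block; the static 'block' pointer above caches the last block
 * seen for exactly that case.
 */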
2331
2332static inline void *host_from_ram_block_offset(RAMBlock *block,
2333 ram_addr_t offset)
2334{
2335 if (!offset_in_ramblock(block, offset)) {
2336 return NULL;
2337 }
2338
2339 return block->host + offset;
56e93d26
JQ
2340}
2341
3d0684b2
JQ
2342/**
2343 * ram_handle_compressed: handle the zero page case
2344 *
56e93d26
JQ
2345 * If a page (or a whole RDMA chunk) has been
2346 * determined to be zero, then zap it.
3d0684b2
JQ
2347 *
2348 * @host: host address for the zero page
2349 * @ch: what the page is filled from. We only support zero
2350 * @size: size of the zero page
56e93d26
JQ
2351 */
2352void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2353{
2354 if (ch != 0 || !is_zero_range(host, size)) {
2355 memset(host, ch, size);
2356 }
2357}
2358
2359static void *do_data_decompress(void *opaque)
2360{
2361 DecompressParam *param = opaque;
2362 unsigned long pagesize;
33d151f4
LL
2363 uint8_t *des;
2364 int len;
56e93d26 2365
33d151f4 2366 qemu_mutex_lock(&param->mutex);
90e56fb4 2367 while (!param->quit) {
33d151f4
LL
2368 if (param->des) {
2369 des = param->des;
2370 len = param->len;
2371 param->des = 0;
2372 qemu_mutex_unlock(&param->mutex);
2373
56e93d26 2374 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2375 /* uncompress() can fail in some cases, especially when the page
2376 * is dirtied while it is being compressed; that's not a problem
2377 * because the dirty page will be retransmitted and uncompress()
2378 * won't corrupt the data in other pages.
2379 */
33d151f4
LL
2380 uncompress((Bytef *)des, &pagesize,
2381 (const Bytef *)param->compbuf, len);
73a8912b 2382
33d151f4
LL
2383 qemu_mutex_lock(&decomp_done_lock);
2384 param->done = true;
2385 qemu_cond_signal(&decomp_done_cond);
2386 qemu_mutex_unlock(&decomp_done_lock);
2387
2388 qemu_mutex_lock(&param->mutex);
2389 } else {
2390 qemu_cond_wait(&param->cond, &param->mutex);
2391 }
56e93d26 2392 }
33d151f4 2393 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2394
2395 return NULL;
2396}
2397
5533b2e9
LL
2398static void wait_for_decompress_done(void)
2399{
2400 int idx, thread_count;
2401
2402 if (!migrate_use_compression()) {
2403 return;
2404 }
2405
2406 thread_count = migrate_decompress_threads();
2407 qemu_mutex_lock(&decomp_done_lock);
2408 for (idx = 0; idx < thread_count; idx++) {
2409 while (!decomp_param[idx].done) {
2410 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2411 }
2412 }
2413 qemu_mutex_unlock(&decomp_done_lock);
2414}
2415
56e93d26
JQ
2416void migrate_decompress_threads_create(void)
2417{
2418 int i, thread_count;
2419
2420 thread_count = migrate_decompress_threads();
2421 decompress_threads = g_new0(QemuThread, thread_count);
2422 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2423 qemu_mutex_init(&decomp_done_lock);
2424 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2425 for (i = 0; i < thread_count; i++) {
2426 qemu_mutex_init(&decomp_param[i].mutex);
2427 qemu_cond_init(&decomp_param[i].cond);
2428 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2429 decomp_param[i].done = true;
90e56fb4 2430 decomp_param[i].quit = false;
56e93d26
JQ
2431 qemu_thread_create(decompress_threads + i, "decompress",
2432 do_data_decompress, decomp_param + i,
2433 QEMU_THREAD_JOINABLE);
2434 }
2435}
2436
2437void migrate_decompress_threads_join(void)
2438{
2439 int i, thread_count;
2440
56e93d26
JQ
2441 thread_count = migrate_decompress_threads();
2442 for (i = 0; i < thread_count; i++) {
2443 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2444 decomp_param[i].quit = true;
56e93d26
JQ
2445 qemu_cond_signal(&decomp_param[i].cond);
2446 qemu_mutex_unlock(&decomp_param[i].mutex);
2447 }
2448 for (i = 0; i < thread_count; i++) {
2449 qemu_thread_join(decompress_threads + i);
2450 qemu_mutex_destroy(&decomp_param[i].mutex);
2451 qemu_cond_destroy(&decomp_param[i].cond);
2452 g_free(decomp_param[i].compbuf);
2453 }
2454 g_free(decompress_threads);
2455 g_free(decomp_param);
56e93d26
JQ
2456 decompress_threads = NULL;
2457 decomp_param = NULL;
56e93d26
JQ
2458}
2459
c1bc6626 2460static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2461 void *host, int len)
2462{
2463 int idx, thread_count;
2464
2465 thread_count = migrate_decompress_threads();
73a8912b 2466 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2467 while (true) {
2468 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2469 if (decomp_param[idx].done) {
33d151f4
LL
2470 decomp_param[idx].done = false;
2471 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2472 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2473 decomp_param[idx].des = host;
2474 decomp_param[idx].len = len;
33d151f4
LL
2475 qemu_cond_signal(&decomp_param[idx].cond);
2476 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2477 break;
2478 }
2479 }
2480 if (idx < thread_count) {
2481 break;
73a8912b
LL
2482 } else {
2483 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2484 }
2485 }
73a8912b 2486 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2487}
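/*
 * Hand-off to do_data_decompress() above: an idle worker is claimed by
 * clearing its 'done' flag, the compressed data is read into that
 * worker's compbuf under its per-thread mutex, and its condition
 * variable is signalled; if no worker is idle we block on
 * decomp_done_cond until one finishes.
 */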
2488
3d0684b2
JQ
2489/**
2490 * ram_postcopy_incoming_init: allocate postcopy data structures
2491 *
2492 * Returns 0 for success and negative if there was one error
2493 *
2494 * @mis: current migration incoming state
2495 *
2496 * Allocate data structures etc needed by incoming migration with
2497 * postcopy-ram. postcopy-ram's similarly named
2498 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2499 */
2500int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2501{
2502 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2503
2504 return postcopy_ram_incoming_init(mis, ram_pages);
2505}
2506
3d0684b2
JQ
2507/**
2508 * ram_load_postcopy: load a page in postcopy case
2509 *
2510 * Returns 0 for success or -errno in case of error
2511 *
a7180877
DDAG
2512 * Called in postcopy mode by ram_load().
2513 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2514 *
2515 * @f: QEMUFile where to send the data
a7180877
DDAG
2516 */
2517static int ram_load_postcopy(QEMUFile *f)
2518{
2519 int flags = 0, ret = 0;
2520 bool place_needed = false;
28abd200 2521 bool matching_page_sizes = false;
a7180877
DDAG
2522 MigrationIncomingState *mis = migration_incoming_get_current();
2523 /* Temporary page that is later 'placed' */
2524 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2525 void *last_host = NULL;
a3b6ff6d 2526 bool all_zero = false;
a7180877
DDAG
2527
2528 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2529 ram_addr_t addr;
2530 void *host = NULL;
2531 void *page_buffer = NULL;
2532 void *place_source = NULL;
df9ff5e1 2533 RAMBlock *block = NULL;
a7180877 2534 uint8_t ch;
a7180877
DDAG
2535
2536 addr = qemu_get_be64(f);
2537 flags = addr & ~TARGET_PAGE_MASK;
2538 addr &= TARGET_PAGE_MASK;
2539
2540 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2541 place_needed = false;
2542 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2543 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2544
2545 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2546 if (!host) {
2547 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2548 ret = -EINVAL;
2549 break;
2550 }
28abd200 2551 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2552 /*
28abd200
DDAG
2553 * Postcopy requires that we place whole host pages atomically;
2554 * these may be huge pages for RAMBlocks that are backed by
2555 * hugetlbfs.
a7180877
DDAG
2556 * To make it atomic, the data is read into a temporary page
2557 * that's moved into place later.
2558 * The migration protocol uses, possibly smaller, target pages;
2559 * however the source ensures it always sends all the components
2560 * of a host page in order.
2561 */
2562 page_buffer = postcopy_host_page +
28abd200 2563 ((uintptr_t)host & (block->page_size - 1));
a7180877 2564 /* If all TP are zero then we can optimise the place */
28abd200 2565 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2566 all_zero = true;
c53b7ddc
DDAG
2567 } else {
2568 /* not the 1st TP within the HP */
2569 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2570 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2571 host, last_host);
2572 ret = -EINVAL;
2573 break;
2574 }
a7180877
DDAG
2575 }
2576
c53b7ddc 2577
a7180877
DDAG
2578 /*
2579 * If it's the last part of a host page then we place the host
2580 * page
2581 */
2582 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2583 (block->page_size - 1)) == 0;
a7180877
DDAG
2584 place_source = postcopy_host_page;
2585 }
c53b7ddc 2586 last_host = host;
a7180877
DDAG
2587
2588 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2589 case RAM_SAVE_FLAG_COMPRESS:
2590 ch = qemu_get_byte(f);
2591 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2592 if (ch) {
2593 all_zero = false;
2594 }
2595 break;
2596
2597 case RAM_SAVE_FLAG_PAGE:
2598 all_zero = false;
2599 if (!place_needed || !matching_page_sizes) {
2600 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2601 } else {
2602 /* Avoids the copy out of the qemu_file buffer; postcopy is
2603 * going to copy the page into place later anyway, and we can
2604 * only do this when the read happens in one go (matching page sizes)
2605 */
2606 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2607 TARGET_PAGE_SIZE);
2608 }
2609 break;
2610 case RAM_SAVE_FLAG_EOS:
2611 /* normal exit */
2612 break;
2613 default:
2614 error_report("Unknown combination of migration flags: %#x"
2615 " (postcopy mode)", flags);
2616 ret = -EINVAL;
2617 }
2618
2619 if (place_needed) {
2620 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2621 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2622
a7180877 2623 if (all_zero) {
df9ff5e1
DDAG
2624 ret = postcopy_place_page_zero(mis, place_dest,
2625 block->page_size);
a7180877 2626 } else {
df9ff5e1
DDAG
2627 ret = postcopy_place_page(mis, place_dest,
2628 place_source, block->page_size);
a7180877
DDAG
2629 }
2630 }
2631 if (!ret) {
2632 ret = qemu_file_get_error(f);
2633 }
2634 }
2635
2636 return ret;
2637}
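/*
 * Example of the host-page assembly above, assuming 4K target pages and
 * a RAMBlock backed by 2M huge pages: 512 consecutive target pages are
 * gathered in postcopy_host_page, and only once the last of them has
 * arrived is the whole huge page placed atomically, via
 * postcopy_place_page() or, if every target page was zero,
 * postcopy_place_page_zero().
 */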
2638
56e93d26
JQ
2639static int ram_load(QEMUFile *f, void *opaque, int version_id)
2640{
2641 int flags = 0, ret = 0;
2642 static uint64_t seq_iter;
2643 int len = 0;
a7180877
DDAG
2644 /*
2645 * If system is running in postcopy mode, page inserts to host memory must
2646 * be atomic
2647 */
2648 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2649 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2650 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2651
2652 seq_iter++;
2653
2654 if (version_id != 4) {
2655 ret = -EINVAL;
2656 }
2657
2658 /* This RCU critical section can be very long running.
2659 * When RCU reclaims in the code start to become numerous,
2660 * it will be necessary to reduce the granularity of this
2661 * critical section.
2662 */
2663 rcu_read_lock();
a7180877
DDAG
2664
2665 if (postcopy_running) {
2666 ret = ram_load_postcopy(f);
2667 }
2668
2669 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2670 ram_addr_t addr, total_ram_bytes;
a776aa15 2671 void *host = NULL;
56e93d26
JQ
2672 uint8_t ch;
2673
2674 addr = qemu_get_be64(f);
2675 flags = addr & ~TARGET_PAGE_MASK;
2676 addr &= TARGET_PAGE_MASK;
2677
a776aa15
DDAG
2678 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2679 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2680 RAMBlock *block = ram_block_from_stream(f, flags);
2681
2682 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2683 if (!host) {
2684 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2685 ret = -EINVAL;
2686 break;
2687 }
2688 }
2689
56e93d26
JQ
2690 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2691 case RAM_SAVE_FLAG_MEM_SIZE:
2692 /* Synchronize RAM block list */
2693 total_ram_bytes = addr;
2694 while (!ret && total_ram_bytes) {
2695 RAMBlock *block;
56e93d26
JQ
2696 char id[256];
2697 ram_addr_t length;
2698
2699 len = qemu_get_byte(f);
2700 qemu_get_buffer(f, (uint8_t *)id, len);
2701 id[len] = 0;
2702 length = qemu_get_be64(f);
2703
e3dd7493
DDAG
2704 block = qemu_ram_block_by_name(id);
2705 if (block) {
2706 if (length != block->used_length) {
2707 Error *local_err = NULL;
56e93d26 2708
fa53a0e5 2709 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2710 &local_err);
2711 if (local_err) {
2712 error_report_err(local_err);
56e93d26 2713 }
56e93d26 2714 }
ef08fb38
DDAG
2715 /* For postcopy we need to check hugepage sizes match */
2716 if (postcopy_advised &&
2717 block->page_size != qemu_host_page_size) {
2718 uint64_t remote_page_size = qemu_get_be64(f);
2719 if (remote_page_size != block->page_size) {
2720 error_report("Mismatched RAM page size %s "
2721 "(local) %zd != %" PRId64,
2722 id, block->page_size,
2723 remote_page_size);
2724 ret = -EINVAL;
2725 }
2726 }
e3dd7493
DDAG
2727 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2728 block->idstr);
2729 } else {
56e93d26
JQ
2730 error_report("Unknown ramblock \"%s\", cannot "
2731 "accept migration", id);
2732 ret = -EINVAL;
2733 }
2734
2735 total_ram_bytes -= length;
2736 }
2737 break;
a776aa15 2738
56e93d26 2739 case RAM_SAVE_FLAG_COMPRESS:
56e93d26
JQ
2740 ch = qemu_get_byte(f);
2741 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2742 break;
a776aa15 2743
56e93d26 2744 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2745 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2746 break;
56e93d26 2747
a776aa15 2748 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2749 len = qemu_get_be32(f);
2750 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2751 error_report("Invalid compressed data length: %d", len);
2752 ret = -EINVAL;
2753 break;
2754 }
c1bc6626 2755 decompress_data_with_multi_threads(f, host, len);
56e93d26 2756 break;
a776aa15 2757
56e93d26 2758 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2759 if (load_xbzrle(f, addr, host) < 0) {
2760 error_report("Failed to decompress XBZRLE page at "
2761 RAM_ADDR_FMT, addr);
2762 ret = -EINVAL;
2763 break;
2764 }
2765 break;
2766 case RAM_SAVE_FLAG_EOS:
2767 /* normal exit */
2768 break;
2769 default:
2770 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2771 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2772 } else {
2773 error_report("Unknown combination of migration flags: %#x",
2774 flags);
2775 ret = -EINVAL;
2776 }
2777 }
2778 if (!ret) {
2779 ret = qemu_file_get_error(f);
2780 }
2781 }
2782
5533b2e9 2783 wait_for_decompress_done();
56e93d26 2784 rcu_read_unlock();
55c4446b 2785 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2786 return ret;
2787}
2788
2789static SaveVMHandlers savevm_ram_handlers = {
2790 .save_live_setup = ram_save_setup,
2791 .save_live_iterate = ram_save_iterate,
763c906b 2792 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2793 .save_live_complete_precopy = ram_save_complete,
56e93d26
JQ
2794 .save_live_pending = ram_save_pending,
2795 .load_state = ram_load,
6ad2a215 2796 .cleanup = ram_migration_cleanup,
56e93d26
JQ
2797};
2798
2799void ram_mig_init(void)
2800{
2801 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2802 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2803}