migration: provide ram_state_init()
[mirror_qemu.git] / migration / ram.c
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
1393a485 28#include "qemu/osdep.h"
33c11879 29#include "cpu.h"
56e93d26 30#include <zlib.h>
4addcd4f 31#include "qapi-event.h"
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
f2a8f0a6 39#include "migration/register.h"
7b1e1a22 40#include "migration/misc.h"
08a0aee1 41#include "qemu-file.h"
be07b0ac 42#include "postcopy-ram.h"
56e93d26 43#include "migration/page_cache.h"
56e93d26 44#include "qemu/error-report.h"
8acabf69 45#include "qapi/qmp/qerror.h"
56e93d26 46#include "trace.h"
56e93d26 47#include "exec/ram_addr.h"
56e93d26 48#include "qemu/rcu_queue.h"
a91246c9 49#include "migration/colo.h"
9ac78b61 50#include "migration/block.h"
56e93d26 51
56e93d26
JQ
52/***********************************************************/
53/* ram save/restore */
54
bb890ed5
JQ
55/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
56 * worked for pages that were filled with the same char. We switched
57 * it to only search for the zero value. To avoid confusion with
58 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
59 */
60
56e93d26 61#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 62#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
63#define RAM_SAVE_FLAG_MEM_SIZE 0x04
64#define RAM_SAVE_FLAG_PAGE 0x08
65#define RAM_SAVE_FLAG_EOS 0x10
66#define RAM_SAVE_FLAG_CONTINUE 0x20
67#define RAM_SAVE_FLAG_XBZRLE 0x40
68/* 0x80 is reserved in migration.h start with 0x100 next */
69#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
70
56e93d26
JQ
71static inline bool is_zero_range(uint8_t *p, uint64_t size)
72{
a1febc49 73 return buffer_is_zero(p, size);
56e93d26
JQ
74}
75
9360447d
JQ
76XBZRLECacheStats xbzrle_counters;
77
56e93d26
JQ
78/* struct contains the XBZRLE cache and a static page
79 used for the compression */
80static struct {
81 /* buffer used for XBZRLE encoding */
82 uint8_t *encoded_buf;
83 /* buffer for storing page content */
84 uint8_t *current_buf;
85 /* Cache for XBZRLE, Protected by lock. */
86 PageCache *cache;
87 QemuMutex lock;
c00e0928
JQ
88 /* it will store a page full of zeros */
89 uint8_t *zero_target_page;
f265e0e4
JQ
90 /* buffer used for XBZRLE decoding */
91 uint8_t *decoded_buf;
56e93d26
JQ
92} XBZRLE;
93
56e93d26
JQ
94static void XBZRLE_cache_lock(void)
95{
96 if (migrate_use_xbzrle())
97 qemu_mutex_lock(&XBZRLE.lock);
98}
99
100static void XBZRLE_cache_unlock(void)
101{
102 if (migrate_use_xbzrle())
103 qemu_mutex_unlock(&XBZRLE.lock);
104}
105
3d0684b2
JQ
106/**
107 * xbzrle_cache_resize: resize the xbzrle cache
108 *
109 * This function is called from qmp_migrate_set_cache_size in the main
110 * thread, possibly while a migration is in progress. A running
111 * migration may be using the cache and might finish during this call,
112 * hence changes to the cache are protected by XBZRLE.lock.
113 *
114 * Returns the new_size or negative in case of error.
115 *
116 * @new_size: new cache size
8acabf69 117 * @errp: set to the failure reason if a check fails
56e93d26 118 */
8acabf69 119int64_t xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
120{
121 PageCache *new_cache;
122 int64_t ret;
123
8acabf69
JQ
124 /* Check for truncation */
125 if (new_size != (size_t)new_size) {
126 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
127 "exceeding address space");
128 return -1;
129 }
130
131 /* Cache should not be larger than guest ram size */
132 if (new_size > ram_bytes_total()) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeds guest ram size");
135 return -1;
136 }
137
56e93d26
JQ
138 XBZRLE_cache_lock();
139
140 if (XBZRLE.cache != NULL) {
141 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
142 goto out_new_size;
143 }
80f8dfde 144 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 145 if (!new_cache) {
56e93d26
JQ
146 ret = -1;
147 goto out;
148 }
149
150 cache_fini(XBZRLE.cache);
151 XBZRLE.cache = new_cache;
152 }
153
154out_new_size:
155 ret = pow2floor(new_size);
156out:
157 XBZRLE_cache_unlock();
158 return ret;
159}
160
ec481c6c
JQ
161/*
162 * An outstanding page request, on the source, having been received
163 * and queued
164 */
165struct RAMSrcPageRequest {
166 RAMBlock *rb;
167 hwaddr offset;
168 hwaddr len;
169
170 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
171};
172
6f37bb8b
JQ
173/* State of RAM for migration */
174struct RAMState {
204b88b8
JQ
175 /* QEMUFile used for this migration */
176 QEMUFile *f;
6f37bb8b
JQ
177 /* Last block that we have visited searching for dirty pages */
178 RAMBlock *last_seen_block;
179 /* Last block from where we have sent data */
180 RAMBlock *last_sent_block;
269ace29
JQ
181 /* Last dirty target page we have sent */
182 ram_addr_t last_page;
6f37bb8b
JQ
183 /* last ram version we have seen */
184 uint32_t last_version;
185 /* We are in the first round */
186 bool ram_bulk_stage;
8d820d6f
JQ
187 /* How many times we have dirty too many pages */
188 int dirty_rate_high_cnt;
f664da80
JQ
189 /* these variables are used for bitmap sync */
190 /* last time we did a full bitmap_sync */
191 int64_t time_last_bitmap_sync;
eac74159 192 /* bytes transferred at start_time */
c4bdf0cf 193 uint64_t bytes_xfer_prev;
a66cd90c 194 /* number of dirty pages since start_time */
68908ed6 195 uint64_t num_dirty_pages_period;
b5833fde
JQ
196 /* xbzrle misses since the beginning of the period */
197 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
198 /* number of iterations at the beginning of period */
199 uint64_t iterations_prev;
23b28c3c
JQ
200 /* Iterations since start */
201 uint64_t iterations;
9360447d 202 /* number of dirty bits in the bitmap */
2dfaf12e
PX
203 uint64_t migration_dirty_pages;
204 /* protects modification of the bitmap */
108cfae0 205 QemuMutex bitmap_mutex;
68a098f3
JQ
206 /* The RAMBlock used in the last src_page_requests */
207 RAMBlock *last_req_rb;
ec481c6c
JQ
208 /* Queue of outstanding page requests from the destination */
209 QemuMutex src_page_req_mutex;
210 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
211};
212typedef struct RAMState RAMState;
213
53518d94 214static RAMState *ram_state;
6f37bb8b 215
9edabd4d 216uint64_t ram_bytes_remaining(void)
2f4fde93 217{
53518d94 218 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
2f4fde93
JQ
219}
220
9360447d 221MigrationStats ram_counters;
96506894 222
b8fb8cb7
DDAG
223/* used by the search for pages to send */
224struct PageSearchStatus {
225 /* Current block being searched */
226 RAMBlock *block;
a935e30f
JQ
227 /* Current page to search from */
228 unsigned long page;
b8fb8cb7
DDAG
229 /* Set once we wrap around */
230 bool complete_round;
231};
232typedef struct PageSearchStatus PageSearchStatus;
233
56e93d26 234struct CompressParam {
56e93d26 235 bool done;
90e56fb4 236 bool quit;
56e93d26
JQ
237 QEMUFile *file;
238 QemuMutex mutex;
239 QemuCond cond;
240 RAMBlock *block;
241 ram_addr_t offset;
242};
243typedef struct CompressParam CompressParam;
244
245struct DecompressParam {
73a8912b 246 bool done;
90e56fb4 247 bool quit;
56e93d26
JQ
248 QemuMutex mutex;
249 QemuCond cond;
250 void *des;
d341d9f3 251 uint8_t *compbuf;
56e93d26
JQ
252 int len;
253};
254typedef struct DecompressParam DecompressParam;
255
256static CompressParam *comp_param;
257static QemuThread *compress_threads;
258/* comp_done_cond is used to wake up the migration thread when
259 * one of the compression threads has finished the compression.
260 * comp_done_lock is used to co-work with comp_done_cond.
261 */
0d9f9a5c
LL
262static QemuMutex comp_done_lock;
263static QemuCond comp_done_cond;
56e93d26
JQ
264/* The empty QEMUFileOps will be used by the file member in CompressParam */
265static const QEMUFileOps empty_ops = { };
266
56e93d26
JQ
267static DecompressParam *decomp_param;
268static QemuThread *decompress_threads;
73a8912b
LL
269static QemuMutex decomp_done_lock;
270static QemuCond decomp_done_cond;
56e93d26 271
a7a9a88f
LL
272static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
273 ram_addr_t offset);
56e93d26
JQ
274
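/*
 * do_data_compress: body of one compression worker thread.
 *
 * Waits until the migration thread hands it a (block, offset) pair,
 * compresses that page into the thread's private QEMUFile buffer via
 * do_compress_ram_page(), then marks itself done and signals
 * comp_done_cond so the buffered data can be flushed to the stream.
 * The loop exits once param->quit is set.
 */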
275static void *do_data_compress(void *opaque)
276{
277 CompressParam *param = opaque;
a7a9a88f
LL
278 RAMBlock *block;
279 ram_addr_t offset;
56e93d26 280
a7a9a88f 281 qemu_mutex_lock(&param->mutex);
90e56fb4 282 while (!param->quit) {
a7a9a88f
LL
283 if (param->block) {
284 block = param->block;
285 offset = param->offset;
286 param->block = NULL;
287 qemu_mutex_unlock(&param->mutex);
288
289 do_compress_ram_page(param->file, block, offset);
290
0d9f9a5c 291 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 292 param->done = true;
0d9f9a5c
LL
293 qemu_cond_signal(&comp_done_cond);
294 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
295
296 qemu_mutex_lock(&param->mutex);
297 } else {
56e93d26
JQ
298 qemu_cond_wait(&param->cond, &param->mutex);
299 }
56e93d26 300 }
a7a9a88f 301 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
302
303 return NULL;
304}
305
306static inline void terminate_compression_threads(void)
307{
308 int idx, thread_count;
309
310 thread_count = migrate_compress_threads();
3d0684b2 311
56e93d26
JQ
312 for (idx = 0; idx < thread_count; idx++) {
313 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 314 comp_param[idx].quit = true;
56e93d26
JQ
315 qemu_cond_signal(&comp_param[idx].cond);
316 qemu_mutex_unlock(&comp_param[idx].mutex);
317 }
318}
319
f0afa331 320static void compress_threads_save_cleanup(void)
56e93d26
JQ
321{
322 int i, thread_count;
323
324 if (!migrate_use_compression()) {
325 return;
326 }
327 terminate_compression_threads();
328 thread_count = migrate_compress_threads();
329 for (i = 0; i < thread_count; i++) {
330 qemu_thread_join(compress_threads + i);
331 qemu_fclose(comp_param[i].file);
332 qemu_mutex_destroy(&comp_param[i].mutex);
333 qemu_cond_destroy(&comp_param[i].cond);
334 }
0d9f9a5c
LL
335 qemu_mutex_destroy(&comp_done_lock);
336 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
337 g_free(compress_threads);
338 g_free(comp_param);
56e93d26
JQ
339 compress_threads = NULL;
340 comp_param = NULL;
56e93d26
JQ
341}
342
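/*
 * compress_threads_save_setup: start the per-migration compression threads.
 *
 * Does nothing unless the compress capability is enabled.  Each thread
 * gets its own CompressParam with a dummy QEMUFile (empty ops, used only
 * as an in-memory buffer) and is created joinable so that
 * compress_threads_save_cleanup() can wait for it later.
 */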
f0afa331 343static void compress_threads_save_setup(void)
56e93d26
JQ
344{
345 int i, thread_count;
346
347 if (!migrate_use_compression()) {
348 return;
349 }
56e93d26
JQ
350 thread_count = migrate_compress_threads();
351 compress_threads = g_new0(QemuThread, thread_count);
352 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
353 qemu_cond_init(&comp_done_cond);
354 qemu_mutex_init(&comp_done_lock);
56e93d26 355 for (i = 0; i < thread_count; i++) {
e110aa91
C
356 /* comp_param[i].file is just used as a dummy buffer to save data,
357 * set its ops to empty.
56e93d26
JQ
358 */
359 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
360 comp_param[i].done = true;
90e56fb4 361 comp_param[i].quit = false;
56e93d26
JQ
362 qemu_mutex_init(&comp_param[i].mutex);
363 qemu_cond_init(&comp_param[i].cond);
364 qemu_thread_create(compress_threads + i, "compress",
365 do_data_compress, comp_param + i,
366 QEMU_THREAD_JOINABLE);
367 }
368}
369
f986c3d2
JQ
370/* Multiple fd's */
371
372struct MultiFDSendParams {
373 uint8_t id;
374 char *name;
375 QemuThread thread;
376 QemuSemaphore sem;
377 QemuMutex mutex;
378 bool quit;
379};
380typedef struct MultiFDSendParams MultiFDSendParams;
381
382struct {
383 MultiFDSendParams *params;
384 /* number of created threads */
385 int count;
386} *multifd_send_state;
387
388static void terminate_multifd_send_threads(Error *errp)
389{
390 int i;
391
392 for (i = 0; i < multifd_send_state->count; i++) {
393 MultiFDSendParams *p = &multifd_send_state->params[i];
394
395 qemu_mutex_lock(&p->mutex);
396 p->quit = true;
397 qemu_sem_post(&p->sem);
398 qemu_mutex_unlock(&p->mutex);
399 }
400}
401
402int multifd_save_cleanup(Error **errp)
403{
404 int i;
405 int ret = 0;
406
407 if (!migrate_use_multifd()) {
408 return 0;
409 }
410 terminate_multifd_send_threads(NULL);
411 for (i = 0; i < multifd_send_state->count; i++) {
412 MultiFDSendParams *p = &multifd_send_state->params[i];
413
414 qemu_thread_join(&p->thread);
415 qemu_mutex_destroy(&p->mutex);
416 qemu_sem_destroy(&p->sem);
417 g_free(p->name);
418 p->name = NULL;
419 }
420 g_free(multifd_send_state->params);
421 multifd_send_state->params = NULL;
422 g_free(multifd_send_state);
423 multifd_send_state = NULL;
424 return ret;
425}
426
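/*
 * multifd_send_thread: skeleton of a multifd send thread.  At this point
 * it only sleeps on its semaphore and exits when p->quit is set; the
 * actual page transmission is not wired up yet.
 */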
427static void *multifd_send_thread(void *opaque)
428{
429 MultiFDSendParams *p = opaque;
430
431 while (true) {
432 qemu_mutex_lock(&p->mutex);
433 if (p->quit) {
434 qemu_mutex_unlock(&p->mutex);
435 break;
436 }
437 qemu_mutex_unlock(&p->mutex);
438 qemu_sem_wait(&p->sem);
439 }
440
441 return NULL;
442}
443
444int multifd_save_setup(void)
445{
446 int thread_count;
447 uint8_t i;
448
449 if (!migrate_use_multifd()) {
450 return 0;
451 }
452 thread_count = migrate_multifd_channels();
453 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
454 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
455 multifd_send_state->count = 0;
456 for (i = 0; i < thread_count; i++) {
457 MultiFDSendParams *p = &multifd_send_state->params[i];
458
459 qemu_mutex_init(&p->mutex);
460 qemu_sem_init(&p->sem, 0);
461 p->quit = false;
462 p->id = i;
463 p->name = g_strdup_printf("multifdsend_%d", i);
464 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
465 QEMU_THREAD_JOINABLE);
466
467 multifd_send_state->count++;
468 }
469 return 0;
470}
471
472struct MultiFDRecvParams {
473 uint8_t id;
474 char *name;
475 QemuThread thread;
476 QemuSemaphore sem;
477 QemuMutex mutex;
478 bool quit;
479};
480typedef struct MultiFDRecvParams MultiFDRecvParams;
481
482struct {
483 MultiFDRecvParams *params;
484 /* number of created threads */
485 int count;
486} *multifd_recv_state;
487
488static void terminate_multifd_recv_threads(Error *errp)
489{
490 int i;
491
492 for (i = 0; i < multifd_recv_state->count; i++) {
493 MultiFDRecvParams *p = &multifd_recv_state->params[i];
494
495 qemu_mutex_lock(&p->mutex);
496 p->quit = true;
497 qemu_sem_post(&p->sem);
498 qemu_mutex_unlock(&p->mutex);
499 }
500}
501
502int multifd_load_cleanup(Error **errp)
503{
504 int i;
505 int ret = 0;
506
507 if (!migrate_use_multifd()) {
508 return 0;
509 }
510 terminate_multifd_recv_threads(NULL);
511 for (i = 0; i < multifd_recv_state->count; i++) {
512 MultiFDRecvParams *p = &multifd_recv_state->params[i];
513
514 qemu_thread_join(&p->thread);
515 qemu_mutex_destroy(&p->mutex);
516 qemu_sem_destroy(&p->sem);
517 g_free(p->name);
518 p->name = NULL;
519 }
520 g_free(multifd_recv_state->params);
521 multifd_recv_state->params = NULL;
522 g_free(multifd_recv_state);
523 multifd_recv_state = NULL;
524
525 return ret;
526}
527
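/*
 * multifd_recv_thread: receive-side counterpart of multifd_send_thread;
 * it too currently just waits on its semaphore until told to quit.
 */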
528static void *multifd_recv_thread(void *opaque)
529{
530 MultiFDRecvParams *p = opaque;
531
532 while (true) {
533 qemu_mutex_lock(&p->mutex);
534 if (p->quit) {
535 qemu_mutex_unlock(&p->mutex);
536 break;
537 }
538 qemu_mutex_unlock(&p->mutex);
539 qemu_sem_wait(&p->sem);
540 }
541
542 return NULL;
543}
544
545int multifd_load_setup(void)
546{
547 int thread_count;
548 uint8_t i;
549
550 if (!migrate_use_multifd()) {
551 return 0;
552 }
553 thread_count = migrate_multifd_channels();
554 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
555 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
556 multifd_recv_state->count = 0;
557 for (i = 0; i < thread_count; i++) {
558 MultiFDRecvParams *p = &multifd_recv_state->params[i];
559
560 qemu_mutex_init(&p->mutex);
561 qemu_sem_init(&p->sem, 0);
562 p->quit = false;
563 p->id = i;
564 p->name = g_strdup_printf("multifdrecv_%d", i);
565 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
566 QEMU_THREAD_JOINABLE);
567 multifd_recv_state->count++;
568 }
569 return 0;
570}
571
56e93d26 572/**
3d0684b2 573 * save_page_header: write page header to wire
56e93d26
JQ
574 *
575 * If this is the 1st block, it also writes the block identification
576 *
3d0684b2 577 * Returns the number of bytes written
56e93d26
JQ
578 *
579 * @f: QEMUFile where to send the data
580 * @block: block that contains the page we want to send
581 * @offset: offset inside the block for the page
582 * in the lower bits, it contains flags
583 */
2bf3aa85
JQ
584static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
585 ram_addr_t offset)
56e93d26 586{
9f5f380b 587 size_t size, len;
56e93d26 588
24795694
JQ
589 if (block == rs->last_sent_block) {
590 offset |= RAM_SAVE_FLAG_CONTINUE;
591 }
2bf3aa85 592 qemu_put_be64(f, offset);
56e93d26
JQ
593 size = 8;
594
595 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 596 len = strlen(block->idstr);
2bf3aa85
JQ
597 qemu_put_byte(f, len);
598 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 599 size += 1 + len;
24795694 600 rs->last_sent_block = block;
56e93d26
JQ
601 }
602 return size;
603}
604
3d0684b2
JQ
605/**
606 * mig_throttle_guest_down: throttle down the guest
607 *
608 * Reduce amount of guest cpu execution to hopefully slow down memory
609 * writes. If guest dirty memory rate is reduced below the rate at
610 * which we can transfer pages to the destination then we should be
611 * able to complete migration. Some workloads dirty memory way too
612 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
613 */
614static void mig_throttle_guest_down(void)
615{
616 MigrationState *s = migrate_get_current();
2594f56d
DB
617 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
618 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
070afca2
JH
619
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
626 }
627}
628
3d0684b2
JQ
629/**
630 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
631 *
6f37bb8b 632 * @rs: current RAM state
3d0684b2
JQ
633 * @current_addr: address for the zero page
634 *
635 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
636 * The important thing is that a stale (not-yet-0'd) page be replaced
637 * by the new data.
638 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 639 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 640 */
6f37bb8b 641static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 642{
6f37bb8b 643 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
644 return;
645 }
646
647 /* We don't care if this fails to allocate a new cache page
648 * as long as it updated an old one */
c00e0928 649 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 650 ram_counters.dirty_sync_count);
56e93d26
JQ
651}
652
653#define ENCODING_FLAG_XBZRLE 0x1
654
655/**
656 * save_xbzrle_page: compress and send current page
657 *
658 * Returns: 1 means that we wrote the page
659 * 0 means that page is identical to the one already sent
660 * -1 means that xbzrle would be longer than normal
661 *
5a987738 662 * @rs: current RAM state
3d0684b2
JQ
663 * @current_data: pointer to the address of the page contents
664 * @current_addr: addr of the page
56e93d26
JQ
665 * @block: block that contains the page we want to send
666 * @offset: offset inside the block for the page
667 * @last_stage: if we are at the completion stage
56e93d26 668 */
204b88b8 669static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 670 ram_addr_t current_addr, RAMBlock *block,
072c2511 671 ram_addr_t offset, bool last_stage)
56e93d26
JQ
672{
673 int encoded_len = 0, bytes_xbzrle;
674 uint8_t *prev_cached_page;
675
9360447d
JQ
676 if (!cache_is_cached(XBZRLE.cache, current_addr,
677 ram_counters.dirty_sync_count)) {
678 xbzrle_counters.cache_miss++;
56e93d26
JQ
679 if (!last_stage) {
680 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 681 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
682 return -1;
683 } else {
684 /* update *current_data when the page has been
685 inserted into cache */
686 *current_data = get_cached_data(XBZRLE.cache, current_addr);
687 }
688 }
689 return -1;
690 }
691
692 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
693
694 /* save current buffer into memory */
695 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
696
697 /* XBZRLE encoding (if there is no overflow) */
698 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
699 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
700 TARGET_PAGE_SIZE);
701 if (encoded_len == 0) {
55c4446b 702 trace_save_xbzrle_page_skipping();
56e93d26
JQ
703 return 0;
704 } else if (encoded_len == -1) {
55c4446b 705 trace_save_xbzrle_page_overflow();
9360447d 706 xbzrle_counters.overflow++;
56e93d26
JQ
707 /* update data in the cache */
708 if (!last_stage) {
709 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
710 *current_data = prev_cached_page;
711 }
712 return -1;
713 }
714
715 /* Update the cached copy so later encodings diff against the data we just sent */
716 if (!last_stage) {
717 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
718 }
719
720 /* Send XBZRLE based compressed page */
2bf3aa85 721 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
722 offset | RAM_SAVE_FLAG_XBZRLE);
723 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
724 qemu_put_be16(rs->f, encoded_len);
725 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 726 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
727 xbzrle_counters.pages++;
728 xbzrle_counters.bytes += bytes_xbzrle;
729 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
730
731 return 1;
732}
733
3d0684b2
JQ
734/**
735 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 736 *
3d0684b2
JQ
737 * Called with rcu_read_lock() to protect migration_bitmap
738 *
739 * Returns the page offset within the memory region of the start of a dirty page
740 *
6f37bb8b 741 * @rs: current RAM state
3d0684b2 742 * @rb: RAMBlock where to search for dirty pages
a935e30f 743 * @start: page where we start the search
f3f491fc 744 */
56e93d26 745static inline
a935e30f 746unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 747 unsigned long start)
56e93d26 748{
6b6712ef
JQ
749 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
750 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
751 unsigned long next;
752
6b6712ef
JQ
753 if (rs->ram_bulk_stage && start > 0) {
754 next = start + 1;
56e93d26 755 } else {
6b6712ef 756 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
757 }
758
6b6712ef 759 return next;
56e93d26
JQ
760}
761
06b10688 762static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
763 RAMBlock *rb,
764 unsigned long page)
a82d593b
DDAG
765{
766 bool ret;
a82d593b 767
6b6712ef 768 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
769
770 if (ret) {
0d8ec885 771 rs->migration_dirty_pages--;
a82d593b
DDAG
772 }
773 return ret;
774}
775
15440dd5
JQ
776static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
777 ram_addr_t start, ram_addr_t length)
56e93d26 778{
0d8ec885 779 rs->migration_dirty_pages +=
6b6712ef 780 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 781 &rs->num_dirty_pages_period);
56e93d26
JQ
782}
783
3d0684b2
JQ
784/**
785 * ram_pagesize_summary: calculate all the pagesizes of a VM
786 *
787 * Returns a summary bitmap of the page sizes of all RAMBlocks
788 *
789 * For VMs with just normal pages this is equivalent to the host page
790 * size. If it's got some huge pages then it's the OR of all the
791 * different page sizes.
e8ca1db2
DDAG
792 */
793uint64_t ram_pagesize_summary(void)
794{
795 RAMBlock *block;
796 uint64_t summary = 0;
797
99e15582 798 RAMBLOCK_FOREACH(block) {
e8ca1db2
DDAG
799 summary |= block->page_size;
800 }
801
802 return summary;
803}
804
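/*
 * migration_bitmap_sync: sync the dirty bitmap with the memory core.
 *
 * Pulls the dirty log for every RAMBlock into the migration bitmap
 * (under bitmap_mutex and RCU) and updates the dirty page counters.
 * Roughly once a second it also recomputes the dirty page rate, drives
 * the auto-converge throttling heuristic, refreshes the XBZRLE
 * cache-miss rate and resets the period counters.
 */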
8d820d6f 805static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
806{
807 RAMBlock *block;
56e93d26 808 int64_t end_time;
c4bdf0cf 809 uint64_t bytes_xfer_now;
56e93d26 810
9360447d 811 ram_counters.dirty_sync_count++;
56e93d26 812
f664da80
JQ
813 if (!rs->time_last_bitmap_sync) {
814 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
815 }
816
817 trace_migration_bitmap_sync_start();
9c1f8f44 818 memory_global_dirty_log_sync();
56e93d26 819
108cfae0 820 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 821 rcu_read_lock();
99e15582 822 RAMBLOCK_FOREACH(block) {
15440dd5 823 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
824 }
825 rcu_read_unlock();
108cfae0 826 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 827
a66cd90c 828 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 829
56e93d26
JQ
830 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
831
832 /* more than 1 second = 1000 milliseconds */
f664da80 833 if (end_time > rs->time_last_bitmap_sync + 1000) {
d693c6f1 834 /* calculate period counters */
9360447d 835 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
d693c6f1 836 / (end_time - rs->time_last_bitmap_sync);
9360447d 837 bytes_xfer_now = ram_counters.transferred;
d693c6f1 838
9ac78b61
PL
839 /* During block migration the auto-converge logic incorrectly detects
840 * that ram migration makes no progress. Avoid this by disabling the
841 * throttling logic during the bulk phase of block migration. */
842 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
843 /* The following detection logic can be refined later. For now:
844 Check to see if the dirtied bytes are 50% more than the approx.
845 amount of bytes that just got transferred since the last time we
070afca2
JH
846 were in this routine. If that happens twice, start or increase
847 throttling */
070afca2 848
d693c6f1 849 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 850 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 851 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 852 trace_migration_throttle();
8d820d6f 853 rs->dirty_rate_high_cnt = 0;
070afca2 854 mig_throttle_guest_down();
d693c6f1 855 }
56e93d26 856 }
070afca2 857
56e93d26 858 if (migrate_use_xbzrle()) {
23b28c3c 859 if (rs->iterations_prev != rs->iterations) {
9360447d
JQ
860 xbzrle_counters.cache_miss_rate =
861 (double)(xbzrle_counters.cache_miss -
b5833fde 862 rs->xbzrle_cache_miss_prev) /
23b28c3c 863 (rs->iterations - rs->iterations_prev);
56e93d26 864 }
23b28c3c 865 rs->iterations_prev = rs->iterations;
9360447d 866 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
56e93d26 867 }
d693c6f1
FF
868
869 /* reset period counters */
f664da80 870 rs->time_last_bitmap_sync = end_time;
a66cd90c 871 rs->num_dirty_pages_period = 0;
d2a4d85a 872 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 873 }
4addcd4f 874 if (migrate_use_events()) {
9360447d 875 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
4addcd4f 876 }
56e93d26
JQ
877}
878
879/**
3d0684b2 880 * save_zero_page: send the zero page to the stream
56e93d26 881 *
3d0684b2 882 * Returns the number of pages written.
56e93d26 883 *
f7ccd61b 884 * @rs: current RAM state
56e93d26
JQ
885 * @block: block that contains the page we want to send
886 * @offset: offset inside the block for the page
887 * @p: pointer to the page
56e93d26 888 */
ce25d337
JQ
889static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
890 uint8_t *p)
56e93d26
JQ
891{
892 int pages = -1;
893
894 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
9360447d
JQ
895 ram_counters.duplicate++;
896 ram_counters.transferred +=
bb890ed5 897 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
ce25d337 898 qemu_put_byte(rs->f, 0);
9360447d 899 ram_counters.transferred += 1;
56e93d26
JQ
900 pages = 1;
901 }
902
903 return pages;
904}
905
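/*
 * ram_release_pages: when the release-ram capability is enabled and we
 * are in postcopy, discard the given already-sent pages on the source
 * (via ram_discard_range) so that memory can be given back early.
 */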
5727309d 906static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 907{
5727309d 908 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
909 return;
910 }
911
aaa2064c 912 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
913}
914
56e93d26 915/**
3d0684b2 916 * ram_save_page: send the given page to the stream
56e93d26 917 *
3d0684b2 918 * Returns the number of pages written.
3fd3c4b3
DDAG
919 * < 0 - error
920 * >=0 - Number of pages written - this might legally be 0
921 * if xbzrle noticed the page was the same.
56e93d26 922 *
6f37bb8b 923 * @rs: current RAM state
56e93d26
JQ
924 * @block: block that contains the page we want to send
925 * @offset: offset inside the block for the page
926 * @last_stage: if we are at the completion stage
56e93d26 927 */
a0a8aa14 928static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
929{
930 int pages = -1;
931 uint64_t bytes_xmit;
932 ram_addr_t current_addr;
56e93d26
JQ
933 uint8_t *p;
934 int ret;
935 bool send_async = true;
a08f6890 936 RAMBlock *block = pss->block;
a935e30f 937 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 938
2f68e399 939 p = block->host + offset;
1db9d8e5 940 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26
JQ
941
942 /* When in doubt, send the page as normal */
943 bytes_xmit = 0;
ce25d337 944 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
945 offset, TARGET_PAGE_SIZE, &bytes_xmit);
946 if (bytes_xmit) {
9360447d 947 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
948 pages = 1;
949 }
950
951 XBZRLE_cache_lock();
952
953 current_addr = block->offset + offset;
954
56e93d26
JQ
955 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
956 if (ret != RAM_SAVE_CONTROL_DELAYED) {
957 if (bytes_xmit > 0) {
9360447d 958 ram_counters.normal++;
56e93d26 959 } else if (bytes_xmit == 0) {
9360447d 960 ram_counters.duplicate++;
56e93d26
JQ
961 }
962 }
963 } else {
ce25d337 964 pages = save_zero_page(rs, block, offset, p);
56e93d26
JQ
965 if (pages > 0) {
966 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
967 * page would be stale
968 */
6f37bb8b 969 xbzrle_cache_zero_page(rs, current_addr);
a935e30f 970 ram_release_pages(block->idstr, offset, pages);
6f37bb8b 971 } else if (!rs->ram_bulk_stage &&
5727309d 972 !migration_in_postcopy() && migrate_use_xbzrle()) {
204b88b8 973 pages = save_xbzrle_page(rs, &p, current_addr, block,
072c2511 974 offset, last_stage);
56e93d26
JQ
975 if (!last_stage) {
976 /* Can't send this cached data async, since the cache page
977 * might get updated before it gets to the wire
978 */
979 send_async = false;
980 }
981 }
982 }
983
984 /* XBZRLE overflow or normal page */
985 if (pages == -1) {
9360447d
JQ
986 ram_counters.transferred +=
987 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
56e93d26 988 if (send_async) {
ce25d337 989 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
53f09a10 990 migrate_release_ram() &
5727309d 991 migration_in_postcopy());
56e93d26 992 } else {
ce25d337 993 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
56e93d26 994 }
9360447d 995 ram_counters.transferred += TARGET_PAGE_SIZE;
56e93d26 996 pages = 1;
9360447d 997 ram_counters.normal++;
56e93d26
JQ
998 }
999
1000 XBZRLE_cache_unlock();
1001
1002 return pages;
1003}
1004
a7a9a88f
LL
1005static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1006 ram_addr_t offset)
56e93d26 1007{
53518d94 1008 RAMState *rs = ram_state;
56e93d26 1009 int bytes_sent, blen;
a7a9a88f 1010 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 1011
2bf3aa85 1012 bytes_sent = save_page_header(rs, f, block, offset |
56e93d26 1013 RAM_SAVE_FLAG_COMPRESS_PAGE);
a7a9a88f 1014 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
56e93d26 1015 migrate_compress_level());
b3be2896
LL
1016 if (blen < 0) {
1017 bytes_sent = 0;
1018 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1019 error_report("compressed data failed!");
1020 } else {
1021 bytes_sent += blen;
5727309d 1022 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 1023 }
56e93d26
JQ
1024
1025 return bytes_sent;
1026}
1027
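/*
 * flush_compressed_data: wait for every compression thread to finish its
 * current page, then move each thread's buffered output into the
 * migration stream and account the bytes in ram_counters.transferred.
 */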
ce25d337 1028static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1029{
1030 int idx, len, thread_count;
1031
1032 if (!migrate_use_compression()) {
1033 return;
1034 }
1035 thread_count = migrate_compress_threads();
a7a9a88f 1036
0d9f9a5c 1037 qemu_mutex_lock(&comp_done_lock);
56e93d26 1038 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1039 while (!comp_param[idx].done) {
0d9f9a5c 1040 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1041 }
a7a9a88f 1042 }
0d9f9a5c 1043 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1044
1045 for (idx = 0; idx < thread_count; idx++) {
1046 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1047 if (!comp_param[idx].quit) {
ce25d337 1048 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
9360447d 1049 ram_counters.transferred += len;
56e93d26 1050 }
a7a9a88f 1051 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1052 }
1053}
1054
1055static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1056 ram_addr_t offset)
1057{
1058 param->block = block;
1059 param->offset = offset;
1060}
1061
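/*
 * compress_page_with_multi_thread: hand one page to an idle compression
 * thread.  The thread's previously buffered output is flushed first, then
 * its (block, offset) work item is set and it is woken up.  If no thread
 * is idle, block on comp_done_cond until one becomes available.
 *
 * Returns the number of pages handed out (1 on success).
 */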
ce25d337
JQ
1062static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1063 ram_addr_t offset)
56e93d26
JQ
1064{
1065 int idx, thread_count, bytes_xmit = -1, pages = -1;
1066
1067 thread_count = migrate_compress_threads();
0d9f9a5c 1068 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
1069 while (true) {
1070 for (idx = 0; idx < thread_count; idx++) {
1071 if (comp_param[idx].done) {
a7a9a88f 1072 comp_param[idx].done = false;
ce25d337 1073 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 1074 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 1075 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
1076 qemu_cond_signal(&comp_param[idx].cond);
1077 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 1078 pages = 1;
9360447d
JQ
1079 ram_counters.normal++;
1080 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
1081 break;
1082 }
1083 }
1084 if (pages > 0) {
1085 break;
1086 } else {
0d9f9a5c 1087 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
1088 }
1089 }
0d9f9a5c 1090 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1091
1092 return pages;
1093}
1094
1095/**
1096 * ram_save_compressed_page: compress the given page and send it to the stream
1097 *
3d0684b2 1098 * Returns the number of pages written.
56e93d26 1099 *
6f37bb8b 1100 * @rs: current RAM state
56e93d26
JQ
1101 * @block: block that contains the page we want to send
1102 * @offset: offset inside the block for the page
1103 * @last_stage: if we are at the completion stage
56e93d26 1104 */
a0a8aa14
JQ
1105static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1106 bool last_stage)
56e93d26
JQ
1107{
1108 int pages = -1;
fc50438e 1109 uint64_t bytes_xmit = 0;
56e93d26 1110 uint8_t *p;
fc50438e 1111 int ret, blen;
a08f6890 1112 RAMBlock *block = pss->block;
a935e30f 1113 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
56e93d26 1114
2f68e399 1115 p = block->host + offset;
56e93d26 1116
ce25d337 1117 ret = ram_control_save_page(rs->f, block->offset,
56e93d26
JQ
1118 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1119 if (bytes_xmit) {
9360447d 1120 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
1121 pages = 1;
1122 }
56e93d26
JQ
1123 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1124 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1125 if (bytes_xmit > 0) {
9360447d 1126 ram_counters.normal++;
56e93d26 1127 } else if (bytes_xmit == 0) {
9360447d 1128 ram_counters.duplicate++;
56e93d26
JQ
1129 }
1130 }
1131 } else {
1132 /* When starting the process of a new block, the first page of
1133 * the block should be sent out before other pages in the same
1134 * block, and all the pages in last block should have been sent
1135 * out, keeping this order is important, because the 'cont' flag
1136 * is used to avoid resending the block name.
1137 */
6f37bb8b 1138 if (block != rs->last_sent_block) {
ce25d337
JQ
1139 flush_compressed_data(rs);
1140 pages = save_zero_page(rs, block, offset, p);
56e93d26 1141 if (pages == -1) {
fc50438e 1142 /* Make sure the first page is sent out before other pages */
2bf3aa85 1143 bytes_xmit = save_page_header(rs, rs->f, block, offset |
fc50438e 1144 RAM_SAVE_FLAG_COMPRESS_PAGE);
ce25d337 1145 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
fc50438e
LL
1146 migrate_compress_level());
1147 if (blen > 0) {
9360447d
JQ
1148 ram_counters.transferred += bytes_xmit + blen;
1149 ram_counters.normal++;
b3be2896 1150 pages = 1;
fc50438e 1151 } else {
ce25d337 1152 qemu_file_set_error(rs->f, blen);
fc50438e 1153 error_report("compressed data failed!");
b3be2896 1154 }
56e93d26 1155 }
53f09a10 1156 if (pages > 0) {
a935e30f 1157 ram_release_pages(block->idstr, offset, pages);
53f09a10 1158 }
56e93d26 1159 } else {
ce25d337 1160 pages = save_zero_page(rs, block, offset, p);
56e93d26 1161 if (pages == -1) {
ce25d337 1162 pages = compress_page_with_multi_thread(rs, block, offset);
53f09a10 1163 } else {
a935e30f 1164 ram_release_pages(block->idstr, offset, pages);
56e93d26
JQ
1165 }
1166 }
1167 }
1168
1169 return pages;
1170}
1171
3d0684b2
JQ
1172/**
1173 * find_dirty_block: find the next dirty page and update any state
1174 * associated with the search process.
b9e60928 1175 *
3d0684b2 1176 * Returns true if a page is found
b9e60928 1177 *
6f37bb8b 1178 * @rs: current RAM state
3d0684b2
JQ
1179 * @pss: data about the state of the current dirty page scan
1180 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1181 */
f20e2865 1182static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1183{
f20e2865 1184 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1185 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1186 pss->page >= rs->last_page) {
b9e60928
DDAG
1187 /*
1188 * We've been once around the RAM and haven't found anything.
1189 * Give up.
1190 */
1191 *again = false;
1192 return false;
1193 }
a935e30f 1194 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 1195 /* Didn't find anything in this RAM Block */
a935e30f 1196 pss->page = 0;
b9e60928
DDAG
1197 pss->block = QLIST_NEXT_RCU(pss->block, next);
1198 if (!pss->block) {
1199 /* Hit the end of the list */
1200 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1201 /* Flag that we've looped */
1202 pss->complete_round = true;
6f37bb8b 1203 rs->ram_bulk_stage = false;
b9e60928
DDAG
1204 if (migrate_use_xbzrle()) {
1205 /* If xbzrle is on, stop using the data compression at this
1206 * point. In theory, xbzrle can do better than compression.
1207 */
ce25d337 1208 flush_compressed_data(rs);
b9e60928
DDAG
1209 }
1210 }
1211 /* Didn't find anything this time, but try again on the new block */
1212 *again = true;
1213 return false;
1214 } else {
1215 /* Can go around again, but... */
1216 *again = true;
1217 /* We've found something so probably don't need to */
1218 return true;
1219 }
1220}
1221
3d0684b2
JQ
1222/**
1223 * unqueue_page: gets a page off the queue
1224 *
a82d593b 1225 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1226 *
3d0684b2
JQ
1227 * Returns the block of the page (or NULL if none available)
1228 *
ec481c6c 1229 * @rs: current RAM state
3d0684b2 1230 * @offset: used to return the offset within the RAMBlock
a82d593b 1231 */
f20e2865 1232static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1233{
1234 RAMBlock *block = NULL;
1235
ec481c6c
JQ
1236 qemu_mutex_lock(&rs->src_page_req_mutex);
1237 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1238 struct RAMSrcPageRequest *entry =
1239 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1240 block = entry->rb;
1241 *offset = entry->offset;
a82d593b
DDAG
1242
1243 if (entry->len > TARGET_PAGE_SIZE) {
1244 entry->len -= TARGET_PAGE_SIZE;
1245 entry->offset += TARGET_PAGE_SIZE;
1246 } else {
1247 memory_region_unref(block->mr);
ec481c6c 1248 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1249 g_free(entry);
1250 }
1251 }
ec481c6c 1252 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1253
1254 return block;
1255}
1256
3d0684b2
JQ
1257/**
1258 * get_queued_page: unqueue a page from the postcopy requests
1259 *
1260 * Skips pages that are already sent (!dirty)
a82d593b 1261 *
3d0684b2 1262 * Returns true if a queued page is found
a82d593b 1263 *
6f37bb8b 1264 * @rs: current RAM state
3d0684b2 1265 * @pss: data about the state of the current dirty page scan
a82d593b 1266 */
f20e2865 1267static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1268{
1269 RAMBlock *block;
1270 ram_addr_t offset;
1271 bool dirty;
1272
1273 do {
f20e2865 1274 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1275 /*
1276 * We're sending this page, and since it's postcopy nothing else
1277 * will dirty it, and we must make sure it doesn't get sent again
1278 * even if this queue request was received after the background
1279 * search already sent it.
1280 */
1281 if (block) {
f20e2865
JQ
1282 unsigned long page;
1283
6b6712ef
JQ
1284 page = offset >> TARGET_PAGE_BITS;
1285 dirty = test_bit(page, block->bmap);
a82d593b 1286 if (!dirty) {
06b10688 1287 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 1288 page, test_bit(page, block->unsentmap));
a82d593b 1289 } else {
f20e2865 1290 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1291 }
1292 }
1293
1294 } while (block && !dirty);
1295
1296 if (block) {
1297 /*
1298 * As soon as we start servicing pages out of order, then we have
1299 * to kill the bulk stage, since the bulk stage assumes
1300 * in (migration_bitmap_find_and_reset_dirty) that every page is
1301 * dirty, that's no longer true.
1302 */
6f37bb8b 1303 rs->ram_bulk_stage = false;
a82d593b
DDAG
1304
1305 /*
1306 * We want the background search to continue from the queued page
1307 * since the guest is likely to want other pages near to the page
1308 * it just requested.
1309 */
1310 pss->block = block;
a935e30f 1311 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
1312 }
1313
1314 return !!block;
1315}
1316
6c595cde 1317/**
5e58f968
JQ
1318 * migration_page_queue_free: drop any remaining pages in the ram
1319 * request queue
6c595cde 1320 *
3d0684b2
JQ
1321 * It should be empty at the end anyway, but in error cases there may
1322 * be some left. In case any pages are left, we drop them.
1323 *
6c595cde 1324 */
83c13382 1325static void migration_page_queue_free(RAMState *rs)
6c595cde 1326{
ec481c6c 1327 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1328 /* This queue generally should be empty - but in the case of a failed
1329 * migration it might have some leftover entries.
1330 */
1331 rcu_read_lock();
ec481c6c 1332 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1333 memory_region_unref(mspr->rb->mr);
ec481c6c 1334 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1335 g_free(mspr);
1336 }
1337 rcu_read_unlock();
1338}
1339
1340/**
3d0684b2
JQ
1341 * ram_save_queue_pages: queue the page for transmission
1342 *
1343 * A request from postcopy destination for example.
1344 *
1345 * Returns zero on success or negative on error
1346 *
3d0684b2
JQ
1347 * @rbname: Name of the RAMBlock of the request. NULL means the
1348 * same as the last one.
1349 * @start: starting address from the start of the RAMBlock
1350 * @len: length (in bytes) to send
6c595cde 1351 */
96506894 1352int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1353{
1354 RAMBlock *ramblock;
53518d94 1355 RAMState *rs = ram_state;
6c595cde 1356
9360447d 1357 ram_counters.postcopy_requests++;
6c595cde
DDAG
1358 rcu_read_lock();
1359 if (!rbname) {
1360 /* Reuse last RAMBlock */
68a098f3 1361 ramblock = rs->last_req_rb;
6c595cde
DDAG
1362
1363 if (!ramblock) {
1364 /*
1365 * Shouldn't happen, we can't reuse the last RAMBlock if
1366 * it's the 1st request.
1367 */
1368 error_report("ram_save_queue_pages no previous block");
1369 goto err;
1370 }
1371 } else {
1372 ramblock = qemu_ram_block_by_name(rbname);
1373
1374 if (!ramblock) {
1375 /* We shouldn't be asked for a non-existent RAMBlock */
1376 error_report("ram_save_queue_pages no block '%s'", rbname);
1377 goto err;
1378 }
68a098f3 1379 rs->last_req_rb = ramblock;
6c595cde
DDAG
1380 }
1381 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1382 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1383 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1384 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1385 __func__, start, len, ramblock->used_length);
1386 goto err;
1387 }
1388
ec481c6c
JQ
1389 struct RAMSrcPageRequest *new_entry =
1390 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1391 new_entry->rb = ramblock;
1392 new_entry->offset = start;
1393 new_entry->len = len;
1394
1395 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1396 qemu_mutex_lock(&rs->src_page_req_mutex);
1397 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1398 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1399 rcu_read_unlock();
1400
1401 return 0;
1402
1403err:
1404 rcu_read_unlock();
1405 return -1;
1406}
1407
a82d593b 1408/**
3d0684b2 1409 * ram_save_target_page: save one target page
a82d593b 1410 *
3d0684b2 1411 * Returns the number of pages written
a82d593b 1412 *
6f37bb8b 1413 * @rs: current RAM state
3d0684b2 1414 * @ms: current migration state
3d0684b2 1415 * @pss: data about the page we want to send
a82d593b 1416 * @last_stage: if we are at the completion stage
a82d593b 1417 */
a0a8aa14 1418static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1419 bool last_stage)
a82d593b
DDAG
1420{
1421 int res = 0;
1422
1423 /* Check if the page is dirty and if so, send it */
f20e2865 1424 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
6d358d94
JQ
1425 /*
1426 * If xbzrle is on, stop using the data compression after first
1427 * round of migration even if compression is enabled. In theory,
1428 * xbzrle can do better than compression.
1429 */
6b6712ef
JQ
1430 if (migrate_use_compression() &&
1431 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
a0a8aa14 1432 res = ram_save_compressed_page(rs, pss, last_stage);
a82d593b 1433 } else {
a0a8aa14 1434 res = ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1435 }
1436
1437 if (res < 0) {
1438 return res;
1439 }
6b6712ef
JQ
1440 if (pss->block->unsentmap) {
1441 clear_bit(pss->page, pss->block->unsentmap);
a82d593b
DDAG
1442 }
1443 }
1444
1445 return res;
1446}
1447
1448/**
3d0684b2 1449 * ram_save_host_page: save a whole host page
a82d593b 1450 *
3d0684b2
JQ
1451 * Starting at the page in @pss, send pages up to the end of the current host
1452 * page. It's valid for the initial offset to point into the middle of
1453 * a host page in which case the remainder of the hostpage is sent.
1454 * Only dirty target pages are sent. Note that the host page size may
1455 * be a huge page for this block.
1eb3fc0a
DDAG
1456 * The saving stops at the boundary of the used_length of the block
1457 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1458 *
3d0684b2
JQ
1459 * Returns the number of pages written or negative on error
1460 *
6f37bb8b 1461 * @rs: current RAM state
3d0684b2 1462 * @ms: current migration state
3d0684b2 1463 * @pss: data about the page we want to send
a82d593b 1464 * @last_stage: if we are at the completion stage
a82d593b 1465 */
a0a8aa14 1466static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1467 bool last_stage)
a82d593b
DDAG
1468{
1469 int tmppages, pages = 0;
a935e30f
JQ
1470 size_t pagesize_bits =
1471 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1472
a82d593b 1473 do {
f20e2865 1474 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1475 if (tmppages < 0) {
1476 return tmppages;
1477 }
1478
1479 pages += tmppages;
a935e30f 1480 pss->page++;
1eb3fc0a
DDAG
1481 } while ((pss->page & (pagesize_bits - 1)) &&
1482 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
1483
1484 /* The offset we leave with is the last one we looked at */
a935e30f 1485 pss->page--;
a82d593b
DDAG
1486 return pages;
1487}
6c595cde 1488
56e93d26 1489/**
3d0684b2 1490 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1491 *
1492 * Called within an RCU critical section.
1493 *
3d0684b2 1494 * Returns the number of pages written where zero means no dirty pages
56e93d26 1495 *
6f37bb8b 1496 * @rs: current RAM state
56e93d26 1497 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1498 *
1499 * On systems where host-page-size > target-page-size it will send all the
1500 * pages in a host page that are dirty.
56e93d26
JQ
1501 */
1502
ce25d337 1503static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1504{
b8fb8cb7 1505 PageSearchStatus pss;
56e93d26 1506 int pages = 0;
b9e60928 1507 bool again, found;
56e93d26 1508
0827b9e9
AA
1509 /* No dirty page as there is zero RAM */
1510 if (!ram_bytes_total()) {
1511 return pages;
1512 }
1513
6f37bb8b 1514 pss.block = rs->last_seen_block;
a935e30f 1515 pss.page = rs->last_page;
b8fb8cb7
DDAG
1516 pss.complete_round = false;
1517
1518 if (!pss.block) {
1519 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1520 }
56e93d26 1521
b9e60928 1522 do {
a82d593b 1523 again = true;
f20e2865 1524 found = get_queued_page(rs, &pss);
b9e60928 1525
a82d593b
DDAG
1526 if (!found) {
1527 /* priority queue empty, so just search for something dirty */
f20e2865 1528 found = find_dirty_block(rs, &pss, &again);
a82d593b 1529 }
f3f491fc 1530
a82d593b 1531 if (found) {
f20e2865 1532 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1533 }
b9e60928 1534 } while (!pages && again);
56e93d26 1535
6f37bb8b 1536 rs->last_seen_block = pss.block;
a935e30f 1537 rs->last_page = pss.page;
56e93d26
JQ
1538
1539 return pages;
1540}
1541
1542void acct_update_position(QEMUFile *f, size_t size, bool zero)
1543{
1544 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1545
56e93d26 1546 if (zero) {
9360447d 1547 ram_counters.duplicate += pages;
56e93d26 1548 } else {
9360447d
JQ
1549 ram_counters.normal += pages;
1550 ram_counters.transferred += size;
56e93d26
JQ
1551 qemu_update_position(f, size);
1552 }
1553}
1554
56e93d26
JQ
1555uint64_t ram_bytes_total(void)
1556{
1557 RAMBlock *block;
1558 uint64_t total = 0;
1559
1560 rcu_read_lock();
99e15582 1561 RAMBLOCK_FOREACH(block) {
56e93d26 1562 total += block->used_length;
99e15582 1563 }
56e93d26
JQ
1564 rcu_read_unlock();
1565 return total;
1566}
1567
f265e0e4 1568static void xbzrle_load_setup(void)
56e93d26 1569{
f265e0e4 1570 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
1571}
1572
f265e0e4
JQ
1573static void xbzrle_load_cleanup(void)
1574{
1575 g_free(XBZRLE.decoded_buf);
1576 XBZRLE.decoded_buf = NULL;
1577}
1578
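/*
 * ram_save_cleanup: tear down the source-side RAM migration state.
 * Stops dirty logging, frees every RAMBlock's dirty/unsent bitmaps,
 * releases the XBZRLE cache and buffers, drains the postcopy request
 * queue, stops the compression threads and finally frees the RAMState.
 */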
1579static void ram_save_cleanup(void *opaque)
56e93d26 1580{
53518d94 1581 RAMState **rsp = opaque;
6b6712ef 1582 RAMBlock *block;
eb859c53 1583
2ff64038
LZ
1584 /* The caller must hold the iothread lock or be in a bh, so there is
1585 * no write race against this migration_bitmap
1586 */
6b6712ef
JQ
1587 memory_global_dirty_log_stop();
1588
1589 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1590 g_free(block->bmap);
1591 block->bmap = NULL;
1592 g_free(block->unsentmap);
1593 block->unsentmap = NULL;
56e93d26
JQ
1594 }
1595
1596 XBZRLE_cache_lock();
1597 if (XBZRLE.cache) {
1598 cache_fini(XBZRLE.cache);
1599 g_free(XBZRLE.encoded_buf);
1600 g_free(XBZRLE.current_buf);
c00e0928 1601 g_free(XBZRLE.zero_target_page);
56e93d26
JQ
1602 XBZRLE.cache = NULL;
1603 XBZRLE.encoded_buf = NULL;
1604 XBZRLE.current_buf = NULL;
c00e0928 1605 XBZRLE.zero_target_page = NULL;
56e93d26
JQ
1606 }
1607 XBZRLE_cache_unlock();
53518d94 1608 migration_page_queue_free(*rsp);
f0afa331 1609 compress_threads_save_cleanup();
53518d94
JQ
1610 g_free(*rsp);
1611 *rsp = NULL;
56e93d26
JQ
1612}
1613
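/*
 * ram_state_reset: reset the dirty-page scan state so the next pass
 * starts again from the first RAMBlock, in bulk stage.
 */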
6f37bb8b 1614static void ram_state_reset(RAMState *rs)
56e93d26 1615{
6f37bb8b
JQ
1616 rs->last_seen_block = NULL;
1617 rs->last_sent_block = NULL;
269ace29 1618 rs->last_page = 0;
6f37bb8b
JQ
1619 rs->last_version = ram_list.version;
1620 rs->ram_bulk_stage = true;
56e93d26
JQ
1621}
1622
1623#define MAX_WAIT 50 /* ms, half buffered_file limit */
1624
4f2e4252
DDAG
1625/*
1626 * 'expected' is the value you expect the bitmap mostly to be full
1627 * of; it won't bother printing lines that are all this value.
1628 * 'todump' is the bitmap to dump and 'pages' is its length in pages.
1629 */
6b6712ef
JQ
1630void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1631 unsigned long pages)
4f2e4252 1632{
4f2e4252
DDAG
1633 int64_t cur;
1634 int64_t linelen = 128;
1635 char linebuf[129];
1636
6b6712ef 1637 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1638 int64_t curb;
1639 bool found = false;
1640 /*
1641 * Last line; catch the case where the line length
1642 * is longer than remaining ram
1643 */
6b6712ef
JQ
1644 if (cur + linelen > pages) {
1645 linelen = pages - cur;
4f2e4252
DDAG
1646 }
1647 for (curb = 0; curb < linelen; curb++) {
1648 bool thisbit = test_bit(cur + curb, todump);
1649 linebuf[curb] = thisbit ? '1' : '.';
1650 found = found || (thisbit != expected);
1651 }
1652 if (found) {
1653 linebuf[curb] = '\0';
1654 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1655 }
1656 }
1657}
1658
e0b266f0
DDAG
1659/* **** functions for postcopy ***** */
1660
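/*
 * ram_postcopy_migrated_memory_release: for every RAMBlock, find runs of
 * pages whose dirty bit is clear (i.e. pages that have already been sent)
 * and discard them with ram_discard_range(), so the source can release
 * that memory while postcopy completes on the destination.
 */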
ced1c616
PB
1661void ram_postcopy_migrated_memory_release(MigrationState *ms)
1662{
1663 struct RAMBlock *block;
ced1c616 1664
99e15582 1665 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1666 unsigned long *bitmap = block->bmap;
1667 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1668 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
1669
1670 while (run_start < range) {
1671 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 1672 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
1673 (run_end - run_start) << TARGET_PAGE_BITS);
1674 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1675 }
1676 }
1677}
1678
3d0684b2
JQ
1679/**
1680 * postcopy_send_discard_bm_ram: discard a RAMBlock
1681 *
1682 * Returns zero on success
1683 *
e0b266f0
DDAG
1684 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1685 * Note: At this point the 'unsentmap' is the processed bitmap combined
1686 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
1687 *
1688 * @ms: current migration state
1689 * @pds: state for postcopy
1690 * @start: RAMBlock starting page
1691 * @length: RAMBlock size
e0b266f0
DDAG
1692 */
1693static int postcopy_send_discard_bm_ram(MigrationState *ms,
1694 PostcopyDiscardState *pds,
6b6712ef 1695 RAMBlock *block)
e0b266f0 1696{
6b6712ef 1697 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 1698 unsigned long current;
6b6712ef 1699 unsigned long *unsentmap = block->unsentmap;
e0b266f0 1700
6b6712ef 1701 for (current = 0; current < end; ) {
e0b266f0
DDAG
1702 unsigned long one = find_next_bit(unsentmap, end, current);
1703
1704 if (one <= end) {
1705 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1706 unsigned long discard_length;
1707
1708 if (zero >= end) {
1709 discard_length = end - one;
1710 } else {
1711 discard_length = zero - one;
1712 }
d688c62d
DDAG
1713 if (discard_length) {
1714 postcopy_discard_send_range(ms, pds, one, discard_length);
1715 }
e0b266f0
DDAG
1716 current = one + discard_length;
1717 } else {
1718 current = one;
1719 }
1720 }
1721
1722 return 0;
1723}
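
The loops in ram_postcopy_migrated_memory_release() and postcopy_send_discard_bm_ram() both walk a page bitmap and turn it into (start, length) runs. Below is a minimal standalone sketch of that run extraction, using a byte-array bitmap and a hand-rolled bit-test helper instead of QEMU's find_next_bit()/find_next_zero_bit(); all names are illustrative, not QEMU API. In the real code each run is handed to postcopy_discard_send_range() or ram_discard_range() rather than printed.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-in for QEMU's bitops; one bit per page. */
static bool test_page(const unsigned char *map, size_t i)
{
    return map[i / 8] & (1u << (i % 8));
}

/* Walk the bitmap and report each run of set bits as (start, length),
 * mirroring the find_next_bit/find_next_zero_bit loop above. */
static void dump_runs(const unsigned char *map, size_t npages)
{
    size_t cur = 0;
    while (cur < npages) {
        while (cur < npages && !test_page(map, cur)) {
            cur++;                        /* skip clear pages */
        }
        size_t start = cur;
        while (cur < npages && test_page(map, cur)) {
            cur++;                        /* extend the run */
        }
        if (cur > start) {
            printf("discard pages %zu..%zu (%zu pages)\n",
                   start, cur - 1, cur - start);
        }
    }
}

int main(void)
{
    /* pages 3..5 and 9 set: expect two runs */
    unsigned char map[2] = { 0x38, 0x02 };
    dump_runs(map, 16);
    return 0;
}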
1724
3d0684b2
JQ
1725/**
1726 * postcopy_each_ram_send_discard: discard all RAMBlocks
1727 *
1728 * Returns 0 for success or negative for error
1729 *
e0b266f0
DDAG
1730 * Utility for the outgoing postcopy code.
1731 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1732 * passing it bitmap indexes and name.
e0b266f0
DDAG
1733 * (qemu_ram_foreach_block ends up passing unscaled lengths
1734 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
1735 *
1736 * @ms: current migration state
e0b266f0
DDAG
1737 */
1738static int postcopy_each_ram_send_discard(MigrationState *ms)
1739{
1740 struct RAMBlock *block;
1741 int ret;
1742
99e15582 1743 RAMBLOCK_FOREACH(block) {
6b6712ef
JQ
1744 PostcopyDiscardState *pds =
1745 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
1746
1747 /*
 1748	         * Postcopy sends chunks of bitmap over the wire, but it
 1749	         * just needs indexes at this point, which avoids it having
 1750	         * target-page-specific code.
1751 */
6b6712ef 1752 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
1753 postcopy_discard_send_finish(ms, pds);
1754 if (ret) {
1755 return ret;
1756 }
1757 }
1758
1759 return 0;
1760}
1761
3d0684b2
JQ
1762/**
1763	 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1764 *
1765 * Helper for postcopy_chunk_hostpages; it's called twice to
1766 * canonicalize the two bitmaps, that are similar, but one is
1767 * inverted.
99e314eb 1768 *
3d0684b2
JQ
1769 * Postcopy requires that all target pages in a hostpage are dirty or
1770 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 1771 *
3d0684b2
JQ
1772 * @ms: current migration state
1773 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1774 * otherwise we need to canonicalize partially dirty host pages
1775 * @block: block that contains the page we want to canonicalize
1776 * @pds: state for postcopy
99e314eb
DDAG
1777 */
1778static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1779 RAMBlock *block,
1780 PostcopyDiscardState *pds)
1781{
53518d94 1782 RAMState *rs = ram_state;
6b6712ef
JQ
1783 unsigned long *bitmap = block->bmap;
1784 unsigned long *unsentmap = block->unsentmap;
29c59172 1785 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 1786 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
1787 unsigned long run_start;
1788
29c59172
DDAG
1789 if (block->page_size == TARGET_PAGE_SIZE) {
1790 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1791 return;
1792 }
1793
99e314eb
DDAG
1794 if (unsent_pass) {
1795 /* Find a sent page */
6b6712ef 1796 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
1797 } else {
1798 /* Find a dirty page */
6b6712ef 1799 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
1800 }
1801
6b6712ef 1802 while (run_start < pages) {
99e314eb
DDAG
1803 bool do_fixup = false;
1804 unsigned long fixup_start_addr;
1805 unsigned long host_offset;
1806
1807 /*
1808 * If the start of this run of pages is in the middle of a host
1809 * page, then we need to fixup this host page.
1810 */
1811 host_offset = run_start % host_ratio;
1812 if (host_offset) {
1813 do_fixup = true;
1814 run_start -= host_offset;
1815 fixup_start_addr = run_start;
1816 /* For the next pass */
1817 run_start = run_start + host_ratio;
1818 } else {
1819 /* Find the end of this run */
1820 unsigned long run_end;
1821 if (unsent_pass) {
6b6712ef 1822 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 1823 } else {
6b6712ef 1824 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
1825 }
1826 /*
1827 * If the end isn't at the start of a host page, then the
1828 * run doesn't finish at the end of a host page
1829 * and we need to discard.
1830 */
1831 host_offset = run_end % host_ratio;
1832 if (host_offset) {
1833 do_fixup = true;
1834 fixup_start_addr = run_end - host_offset;
1835 /*
1836 * This host page has gone, the next loop iteration starts
1837 * from after the fixup
1838 */
1839 run_start = fixup_start_addr + host_ratio;
1840 } else {
1841 /*
1842 * No discards on this iteration, next loop starts from
1843 * next sent/dirty page
1844 */
1845 run_start = run_end + 1;
1846 }
1847 }
1848
1849 if (do_fixup) {
1850 unsigned long page;
1851
1852 /* Tell the destination to discard this page */
1853 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1854 /* For the unsent_pass we:
1855 * discard partially sent pages
1856 * For the !unsent_pass (dirty) we:
1857 * discard partially dirty pages that were sent
1858 * (any partially sent pages were already discarded
1859 * by the previous unsent_pass)
1860 */
1861 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1862 host_ratio);
1863 }
1864
1865 /* Clean up the bitmap */
1866 for (page = fixup_start_addr;
1867 page < fixup_start_addr + host_ratio; page++) {
1868 /* All pages in this host page are now not sent */
1869 set_bit(page, unsentmap);
1870
1871 /*
1872 * Remark them as dirty, updating the count for any pages
1873 * that weren't previously dirty.
1874 */
0d8ec885 1875 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
1876 }
1877 }
1878
1879 if (unsent_pass) {
1880 /* Find the next sent page for the next iteration */
6b6712ef 1881 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
1882 } else {
1883 /* Find the next dirty page for the next iteration */
6b6712ef 1884 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
1885 }
1886 }
1887}
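
The fixup in postcopy_chunk_hostpages_pass() boils down to rounding run boundaries to host-page granularity. A small sketch of that arithmetic, assuming host_ratio target pages per host page (as computed from block->page_size / TARGET_PAGE_SIZE above); the helper names are made up for illustration only.

#include <assert.h>
#include <stdio.h>

/* Round a target-page index down to the first page of its host page. */
static unsigned long host_page_start(unsigned long page, unsigned long host_ratio)
{
    return page - (page % host_ratio);
}

/* Number of target pages from 'page' to the end of its host page. */
static unsigned long pages_to_host_end(unsigned long page, unsigned long host_ratio)
{
    return host_page_start(page, host_ratio) + host_ratio - page;
}

int main(void)
{
    /* e.g. 2MB huge pages with 4KB target pages: 512 target pages per host page */
    unsigned long host_ratio = 512;

    assert(host_page_start(1000, host_ratio) == 512);
    assert(pages_to_host_end(1000, host_ratio) == 24);
    printf("a run starting at page 1000 is fixed up from page %lu for %lu pages\n",
           host_page_start(1000, host_ratio), host_ratio);
    return 0;
}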
1888
3d0684b2
JQ
1889/**
1890	 * postcopy_chunk_hostpages: discard any partially sent host page
1891 *
99e314eb
DDAG
1892 * Utility for the outgoing postcopy code.
1893 *
1894 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
1895 * dirty host-page size chunks as all dirty. In this case the host-page
1896 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 1897 *
3d0684b2
JQ
1898 * Returns zero on success
1899 *
1900 * @ms: current migration state
6b6712ef 1901 * @block: block we want to work with
99e314eb 1902 */
6b6712ef 1903static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 1904{
6b6712ef
JQ
1905 PostcopyDiscardState *pds =
1906 postcopy_discard_send_init(ms, block->idstr);
99e314eb 1907
6b6712ef
JQ
1908 /* First pass: Discard all partially sent host pages */
1909 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1910 /*
1911 * Second pass: Ensure that all partially dirty host pages are made
1912 * fully dirty.
1913 */
1914 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 1915
6b6712ef 1916 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
1917 return 0;
1918}
1919
3d0684b2
JQ
1920/**
1921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1922 *
1923 * Returns zero on success
1924 *
e0b266f0
DDAG
1925	 * Transmit the set of pages to be discarded after precopy to the target;
1926	 * these are pages that:
1927	 *     a) Have been previously transmitted but are now dirty again
1928	 *     b) Pages that have never been transmitted; this ensures that
1929	 *        any pages on the destination that have been mapped by background
1930	 *        tasks get discarded (transparent huge pages are the specific concern)
1931 * Hopefully this is pretty sparse
3d0684b2
JQ
1932 *
1933 * @ms: current migration state
e0b266f0
DDAG
1934 */
1935int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1936{
53518d94 1937 RAMState *rs = ram_state;
6b6712ef 1938 RAMBlock *block;
e0b266f0 1939 int ret;
e0b266f0
DDAG
1940
1941 rcu_read_lock();
1942
1943 /* This should be our last sync, the src is now paused */
eb859c53 1944 migration_bitmap_sync(rs);
e0b266f0 1945
6b6712ef
JQ
1946 /* Easiest way to make sure we don't resume in the middle of a host-page */
1947 rs->last_seen_block = NULL;
1948 rs->last_sent_block = NULL;
1949 rs->last_page = 0;
e0b266f0 1950
6b6712ef
JQ
1951 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1952 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1953 unsigned long *bitmap = block->bmap;
1954 unsigned long *unsentmap = block->unsentmap;
1955
1956 if (!unsentmap) {
1957 /* We don't have a safe way to resize the sentmap, so
1958 * if the bitmap was resized it will be NULL at this
1959 * point.
1960 */
1961 error_report("migration ram resized during precopy phase");
1962 rcu_read_unlock();
1963 return -EINVAL;
1964 }
1965 /* Deal with TPS != HPS and huge pages */
1966 ret = postcopy_chunk_hostpages(ms, block);
1967 if (ret) {
1968 rcu_read_unlock();
1969 return ret;
1970 }
e0b266f0 1971
6b6712ef
JQ
1972 /*
1973 * Update the unsentmap to be unsentmap = unsentmap | dirty
1974 */
1975 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 1976#ifdef DEBUG_POSTCOPY
6b6712ef 1977 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 1978#endif
6b6712ef
JQ
1979 }
1980 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
1981
1982 ret = postcopy_each_ram_send_discard(ms);
1983 rcu_read_unlock();
1984
1985 return ret;
1986}
1987
3d0684b2
JQ
1988/**
1989 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 1990 *
3d0684b2 1991 * Returns zero on success
e0b266f0 1992 *
36449157
JQ
1993 * @rbname: name of the RAMBlock of the request. NULL means the
1994	 *          same as the last one.
3d0684b2
JQ
1995	 * @start: byte offset within the RAMBlock to start discarding at
1996	 * @length: number of bytes to discard
e0b266f0 1997 */
aaa2064c 1998int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
1999{
2000 int ret = -1;
2001
36449157 2002 trace_ram_discard_range(rbname, start, length);
d3a5038c 2003
e0b266f0 2004 rcu_read_lock();
36449157 2005 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2006
2007 if (!rb) {
36449157 2008 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
2009 goto err;
2010 }
2011
d3a5038c 2012 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2013
2014err:
2015 rcu_read_unlock();
2016
2017 return ret;
2018}
2019
53518d94 2020static int ram_state_init(RAMState **rsp)
56e93d26 2021{
7d00ee6a
PX
2022 *rsp = g_try_new0(RAMState, 1);
2023
2024 if (!*rsp) {
2025 error_report("%s: Init ramstate fail", __func__);
2026 return -1;
2027 }
53518d94
JQ
2028
2029 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2030 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2031 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2032
7d00ee6a
PX
2033 /*
2034 * Count the total number of pages used by ram blocks not including any
2035 * gaps due to alignment or unplugs.
2036 */
2037 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2038
2039 ram_state_reset(*rsp);
2040
2041 return 0;
2042}
2043
2044static int ram_init_all(RAMState **rsp)
2045{
2046 Error *local_err = NULL;
2047
2048 if (ram_state_init(rsp)) {
2049 return -1;
2050 }
2051
56e93d26
JQ
2052 if (migrate_use_xbzrle()) {
2053 XBZRLE_cache_lock();
c00e0928 2054 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
80f8dfde
JQ
2055 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2056 TARGET_PAGE_SIZE, &local_err);
56e93d26
JQ
2057 if (!XBZRLE.cache) {
2058 XBZRLE_cache_unlock();
80f8dfde 2059 error_report_err(local_err);
53518d94
JQ
2060 g_free(*rsp);
2061 *rsp = NULL;
56e93d26
JQ
2062 return -1;
2063 }
2064 XBZRLE_cache_unlock();
2065
2066 /* We prefer not to abort if there is no memory */
2067 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2068 if (!XBZRLE.encoded_buf) {
2069 error_report("Error allocating encoded_buf");
53518d94
JQ
2070 g_free(*rsp);
2071 *rsp = NULL;
56e93d26
JQ
2072 return -1;
2073 }
2074
2075 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2076 if (!XBZRLE.current_buf) {
2077 error_report("Error allocating current_buf");
2078 g_free(XBZRLE.encoded_buf);
2079 XBZRLE.encoded_buf = NULL;
53518d94
JQ
2080 g_free(*rsp);
2081 *rsp = NULL;
56e93d26
JQ
2082 return -1;
2083 }
56e93d26
JQ
2084 }
2085
49877834
PB
2086 /* For memory_global_dirty_log_start below. */
2087 qemu_mutex_lock_iothread();
2088
56e93d26
JQ
2089 qemu_mutex_lock_ramlist();
2090 rcu_read_lock();
56e93d26 2091
0827b9e9
AA
2092 /* Skip setting bitmap if there is no RAM */
2093 if (ram_bytes_total()) {
6b6712ef
JQ
2094 RAMBlock *block;
2095
2096 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2097 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
0827b9e9 2098
6b6712ef
JQ
2099 block->bmap = bitmap_new(pages);
2100 bitmap_set(block->bmap, 0, pages);
2101 if (migrate_postcopy_ram()) {
2102 block->unsentmap = bitmap_new(pages);
2103 bitmap_set(block->unsentmap, 0, pages);
2104 }
0827b9e9 2105 }
f3f491fc
DDAG
2106 }
2107
56e93d26 2108 memory_global_dirty_log_start();
53518d94 2109 migration_bitmap_sync(*rsp);
56e93d26 2110 qemu_mutex_unlock_ramlist();
49877834 2111 qemu_mutex_unlock_iothread();
a91246c9
HZ
2112 rcu_read_unlock();
2113
2114 return 0;
2115}
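
ram_init_all() sizes each block's dirty bitmap in target pages and starts with every page dirty. Below is a rough standalone equivalent of that sizing plus the bitmap_new()/bitmap_set() pair, using libc only; TARGET_PAGE_BITS is hard-coded to 12 purely for the example, and this simplified version also sets the unused trailing bits of the last word.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TARGET_PAGE_BITS 12   /* 4K target pages, for illustration */

/* Allocate a bitmap with one bit per target page and mark every page dirty,
 * the standalone equivalent of bitmap_new() + bitmap_set(..., 0, pages). */
static unsigned long *all_dirty_bitmap(uint64_t used_length, unsigned long *out_pages)
{
    unsigned long pages = used_length >> TARGET_PAGE_BITS;
    size_t bits_per_word = sizeof(unsigned long) * CHAR_BIT;
    size_t words = (pages + bits_per_word - 1) / bits_per_word;
    unsigned long *bmap = calloc(words ? words : 1, sizeof(unsigned long));

    if (bmap) {
        memset(bmap, 0xff, words * sizeof(unsigned long));
        *out_pages = pages;
    }
    return bmap;
}

int main(void)
{
    unsigned long pages = 0;
    unsigned long *bmap = all_dirty_bitmap(128ull << 20, &pages);  /* 128MB block */

    printf("%lu pages start out dirty\n", pages);   /* 32768 */
    free(bmap);
    return 0;
}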
2116
3d0684b2
JQ
2117/*
2118 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2119 * long-running RCU critical section. When rcu-reclaims in the code
2120 * start to become numerous it will be necessary to reduce the
2121 * granularity of these critical sections.
2122 */
2123
3d0684b2
JQ
2124/**
2125 * ram_save_setup: Setup RAM for migration
2126 *
2127 * Returns zero to indicate success and negative for error
2128 *
2129 * @f: QEMUFile where to send the data
2130 * @opaque: RAMState pointer
2131 */
a91246c9
HZ
2132static int ram_save_setup(QEMUFile *f, void *opaque)
2133{
53518d94 2134 RAMState **rsp = opaque;
a91246c9
HZ
2135 RAMBlock *block;
2136
2137 /* migration has already setup the bitmap, reuse it. */
2138 if (!migration_in_colo_state()) {
7d00ee6a 2139 if (ram_init_all(rsp) != 0) {
a91246c9 2140 return -1;
53518d94 2141 }
a91246c9 2142 }
53518d94 2143 (*rsp)->f = f;
a91246c9
HZ
2144
2145 rcu_read_lock();
56e93d26
JQ
2146
2147 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2148
99e15582 2149 RAMBLOCK_FOREACH(block) {
56e93d26
JQ
2150 qemu_put_byte(f, strlen(block->idstr));
2151 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2152 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2153 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2154 qemu_put_be64(f, block->page_size);
2155 }
56e93d26
JQ
2156 }
2157
2158 rcu_read_unlock();
f0afa331 2159 compress_threads_save_setup();
56e93d26
JQ
2160
2161 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2162 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2163
2164 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2165
2166 return 0;
2167}
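
ram_save_setup() opens the stream with ram_bytes_total() OR'd with RAM_SAVE_FLAG_MEM_SIZE and then emits one record per block: a length byte, the unterminated idstr, the used_length as a big-endian 64-bit value and, for postcopy with non-standard page sizes, the page size. The sketch below only illustrates the byte layout of the per-block record into a plain buffer; it is not the QEMUFile API and it omits the leading flag word and the optional page_size field.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative big-endian 64-bit write, standing in for qemu_put_be64(). */
static size_t put_be64(uint8_t *out, uint64_t v)
{
    for (int i = 0; i < 8; i++) {
        out[i] = (uint8_t)(v >> (56 - i * 8));
    }
    return 8;
}

/* Emit one per-block record the way ram_save_setup() does:
 * one length byte, the (unterminated) idstr, then used_length as be64. */
static size_t put_block_record(uint8_t *out, const char *idstr, uint64_t used_length)
{
    size_t n = 0;
    uint8_t len = (uint8_t)strlen(idstr);

    out[n++] = len;
    memcpy(out + n, idstr, len);
    n += len;
    n += put_be64(out + n, used_length);
    return n;
}

int main(void)
{
    uint8_t buf[64];
    size_t n = put_block_record(buf, "pc.ram", 512u << 20);

    printf("record for pc.ram is %zu bytes\n", n);  /* 1 + 6 + 8 = 15 */
    return 0;
}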
2168
3d0684b2
JQ
2169/**
2170 * ram_save_iterate: iterative stage for migration
2171 *
2172 * Returns zero to indicate success and negative for error
2173 *
2174 * @f: QEMUFile where to send the data
2175 * @opaque: RAMState pointer
2176 */
56e93d26
JQ
2177static int ram_save_iterate(QEMUFile *f, void *opaque)
2178{
53518d94
JQ
2179 RAMState **temp = opaque;
2180 RAMState *rs = *temp;
56e93d26
JQ
2181 int ret;
2182 int i;
2183 int64_t t0;
5c90308f 2184 int done = 0;
56e93d26
JQ
2185
2186 rcu_read_lock();
6f37bb8b
JQ
2187 if (ram_list.version != rs->last_version) {
2188 ram_state_reset(rs);
56e93d26
JQ
2189 }
2190
2191 /* Read version before ram_list.blocks */
2192 smp_rmb();
2193
2194 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2195
2196 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2197 i = 0;
2198 while ((ret = qemu_file_rate_limit(f)) == 0) {
2199 int pages;
2200
ce25d337 2201 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
2202	        /* no more pages to send */
2203 if (pages == 0) {
5c90308f 2204 done = 1;
56e93d26
JQ
2205 break;
2206 }
23b28c3c 2207 rs->iterations++;
070afca2 2208
56e93d26
JQ
2209	        /* we want to check in the 1st loop, just in case it was the 1st time
2210	           and we had to sync the dirty bitmap.
2211	           qemu_clock_get_ns() is a bit expensive, so we only check it once
2212	           every few iterations
2213 */
2214 if ((i & 63) == 0) {
2215 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2216 if (t1 > MAX_WAIT) {
55c4446b 2217 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2218 break;
2219 }
2220 }
2221 i++;
2222 }
ce25d337 2223 flush_compressed_data(rs);
56e93d26
JQ
2224 rcu_read_unlock();
2225
2226 /*
2227 * Must occur before EOS (or any QEMUFile operation)
2228 * because of RDMA protocol.
2229 */
2230 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2231
2232 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
9360447d 2233 ram_counters.transferred += 8;
56e93d26
JQ
2234
2235 ret = qemu_file_get_error(f);
2236 if (ret < 0) {
2237 return ret;
2238 }
2239
5c90308f 2240 return done;
56e93d26
JQ
2241}
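
The send loop above bounds its runtime by sampling the clock only once every 64 iterations and bailing out after MAX_WAIT ms. Here is a standalone sketch of that throttled clock-check pattern, using POSIX clock_gettime() in place of qemu_clock_get_ns(); the per-iteration work is stubbed out, so an artificial iteration cap keeps the sketch terminating.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Monotonic time in milliseconds; stands in for qemu_clock_get_ns()/1e6. */
static int64_t now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

int main(void)
{
    const int64_t max_wait_ms = 50;   /* same spirit as MAX_WAIT above */
    int64_t t0 = now_ms();
    unsigned long i = 0;

    for (;;) {
        /* ... do one unit of work (send one page) here ... */

        /* The clock is comparatively expensive, so only sample it every
         * 64 iterations, exactly like the (i & 63) check above. */
        if ((i & 63) == 0 && now_ms() - t0 > max_wait_ms) {
            break;
        }
        i++;
        if (i > 1000000) {  /* keep the sketch bounded: the work is a no-op */
            break;
        }
    }
    printf("left the loop after %lu iterations\n", i);
    return 0;
}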
2242
3d0684b2
JQ
2243/**
2244 * ram_save_complete: function called to send the remaining amount of ram
2245 *
2246 * Returns zero to indicate success
2247 *
2248 * Called with iothread lock
2249 *
2250 * @f: QEMUFile where to send the data
2251 * @opaque: RAMState pointer
2252 */
56e93d26
JQ
2253static int ram_save_complete(QEMUFile *f, void *opaque)
2254{
53518d94
JQ
2255 RAMState **temp = opaque;
2256 RAMState *rs = *temp;
6f37bb8b 2257
56e93d26
JQ
2258 rcu_read_lock();
2259
5727309d 2260 if (!migration_in_postcopy()) {
8d820d6f 2261 migration_bitmap_sync(rs);
663e6c1d 2262 }
56e93d26
JQ
2263
2264 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2265
2266 /* try transferring iterative blocks of memory */
2267
2268 /* flush all remaining blocks regardless of rate limiting */
2269 while (true) {
2270 int pages;
2271
ce25d337 2272 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
2273	        /* no more blocks to send */
2274 if (pages == 0) {
2275 break;
2276 }
2277 }
2278
ce25d337 2279 flush_compressed_data(rs);
56e93d26 2280 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2281
2282 rcu_read_unlock();
d09a6fde 2283
56e93d26
JQ
2284 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2285
2286 return 0;
2287}
2288
c31b098f
DDAG
2289static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2290 uint64_t *non_postcopiable_pending,
2291 uint64_t *postcopiable_pending)
56e93d26 2292{
53518d94
JQ
2293 RAMState **temp = opaque;
2294 RAMState *rs = *temp;
56e93d26
JQ
2295 uint64_t remaining_size;
2296
9edabd4d 2297 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2298
5727309d 2299 if (!migration_in_postcopy() &&
663e6c1d 2300 remaining_size < max_size) {
56e93d26
JQ
2301 qemu_mutex_lock_iothread();
2302 rcu_read_lock();
8d820d6f 2303 migration_bitmap_sync(rs);
56e93d26
JQ
2304 rcu_read_unlock();
2305 qemu_mutex_unlock_iothread();
9edabd4d 2306 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2307 }
c31b098f 2308
86e1167e
VSO
2309 if (migrate_postcopy_ram()) {
2310 /* We can do postcopy, and all the data is postcopiable */
2311 *postcopiable_pending += remaining_size;
2312 } else {
2313 *non_postcopiable_pending += remaining_size;
2314 }
56e93d26
JQ
2315}
2316
2317static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2318{
2319 unsigned int xh_len;
2320 int xh_flags;
063e760a 2321 uint8_t *loaded_data;
56e93d26 2322
56e93d26
JQ
2323 /* extract RLE header */
2324 xh_flags = qemu_get_byte(f);
2325 xh_len = qemu_get_be16(f);
2326
2327 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2328 error_report("Failed to load XBZRLE page - wrong compression!");
2329 return -1;
2330 }
2331
2332 if (xh_len > TARGET_PAGE_SIZE) {
2333 error_report("Failed to load XBZRLE page - len overflow!");
2334 return -1;
2335 }
f265e0e4 2336 loaded_data = XBZRLE.decoded_buf;
56e93d26 2337 /* load data and decode */
f265e0e4 2338 /* it can change loaded_data to point to an internal buffer */
063e760a 2339 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2340
2341 /* decode RLE */
063e760a 2342 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2343 TARGET_PAGE_SIZE) == -1) {
2344 error_report("Failed to load XBZRLE page - decode error!");
2345 return -1;
2346 }
2347
2348 return 0;
2349}
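
load_xbzrle() expects a small header in front of the encoded data: one flags byte and a big-endian 16-bit payload length, with the payload capped at the target page size. A minimal parser for just that header over a plain buffer; ENC_XBZRLE below is a placeholder value, the real ENCODING_FLAG_XBZRLE constant is defined elsewhere in QEMU.

#include <stdint.h>
#include <stdio.h>

#define ENC_XBZRLE 0x33   /* illustrative value only */

/* Parse the header that load_xbzrle() reads: one flags byte followed by a
 * big-endian 16-bit payload length. Returns the payload length, or -1. */
static int parse_xbzrle_header(const uint8_t *buf, unsigned int page_size)
{
    uint8_t flags = buf[0];
    unsigned int len = ((unsigned int)buf[1] << 8) | buf[2];

    if (flags != ENC_XBZRLE) {
        return -1;                    /* wrong compression */
    }
    if (len > page_size) {
        return -1;                    /* length overflow */
    }
    return (int)len;
}

int main(void)
{
    uint8_t hdr[3] = { ENC_XBZRLE, 0x01, 0x00 };  /* len = 0x0100 = 256 */

    printf("payload length: %d bytes\n", parse_xbzrle_header(hdr, 4096));
    return 0;
}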
2350
3d0684b2
JQ
2351/**
2352 * ram_block_from_stream: read a RAMBlock id from the migration stream
2353 *
2354 * Must be called from within a rcu critical section.
2355 *
56e93d26 2356 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2357 *
3d0684b2
JQ
2358 * @f: QEMUFile where to read the data from
2359 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2360 */
3d0684b2 2361static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2362{
2363 static RAMBlock *block = NULL;
2364 char id[256];
2365 uint8_t len;
2366
2367 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2368 if (!block) {
56e93d26
JQ
2369 error_report("Ack, bad migration stream!");
2370 return NULL;
2371 }
4c4bad48 2372 return block;
56e93d26
JQ
2373 }
2374
2375 len = qemu_get_byte(f);
2376 qemu_get_buffer(f, (uint8_t *)id, len);
2377 id[len] = 0;
2378
e3dd7493 2379 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2380 if (!block) {
2381 error_report("Can't find block %s", id);
2382 return NULL;
56e93d26
JQ
2383 }
2384
4c4bad48
HZ
2385 return block;
2386}
2387
2388static inline void *host_from_ram_block_offset(RAMBlock *block,
2389 ram_addr_t offset)
2390{
2391 if (!offset_in_ramblock(block, offset)) {
2392 return NULL;
2393 }
2394
2395 return block->host + offset;
56e93d26
JQ
2396}
2397
3d0684b2
JQ
2398/**
2399 * ram_handle_compressed: handle the zero page case
2400 *
56e93d26
JQ
2401 * If a page (or a whole RDMA chunk) has been
2402 * determined to be zero, then zap it.
3d0684b2
JQ
2403 *
2404 * @host: host address for the zero page
2405 * @ch: what the page is filled from. We only support zero
2406 * @size: size of the zero page
56e93d26
JQ
2407 */
2408void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2409{
2410 if (ch != 0 || !is_zero_range(host, size)) {
2411 memset(host, ch, size);
2412 }
2413}
2414
2415static void *do_data_decompress(void *opaque)
2416{
2417 DecompressParam *param = opaque;
2418 unsigned long pagesize;
33d151f4
LL
2419 uint8_t *des;
2420 int len;
56e93d26 2421
33d151f4 2422 qemu_mutex_lock(&param->mutex);
90e56fb4 2423 while (!param->quit) {
33d151f4
LL
2424 if (param->des) {
2425 des = param->des;
2426 len = param->len;
2427 param->des = 0;
2428 qemu_mutex_unlock(&param->mutex);
2429
56e93d26 2430 pagesize = TARGET_PAGE_SIZE;
73a8912b
LL
2431	            /* uncompress() can fail in some cases, especially
2432	             * when the page is dirtied while it is being compressed. That is
2433	             * not a problem, because the dirty page will be retransmitted
2434	             * and uncompress() won't corrupt the data in other pages.
2435 */
33d151f4
LL
2436 uncompress((Bytef *)des, &pagesize,
2437 (const Bytef *)param->compbuf, len);
73a8912b 2438
33d151f4
LL
2439 qemu_mutex_lock(&decomp_done_lock);
2440 param->done = true;
2441 qemu_cond_signal(&decomp_done_cond);
2442 qemu_mutex_unlock(&decomp_done_lock);
2443
2444 qemu_mutex_lock(&param->mutex);
2445 } else {
2446 qemu_cond_wait(&param->cond, &param->mutex);
2447 }
56e93d26 2448 }
33d151f4 2449 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2450
2451 return NULL;
2452}
2453
5533b2e9
LL
2454static void wait_for_decompress_done(void)
2455{
2456 int idx, thread_count;
2457
2458 if (!migrate_use_compression()) {
2459 return;
2460 }
2461
2462 thread_count = migrate_decompress_threads();
2463 qemu_mutex_lock(&decomp_done_lock);
2464 for (idx = 0; idx < thread_count; idx++) {
2465 while (!decomp_param[idx].done) {
2466 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2467 }
2468 }
2469 qemu_mutex_unlock(&decomp_done_lock);
2470}
2471
f0afa331 2472static void compress_threads_load_setup(void)
56e93d26
JQ
2473{
2474 int i, thread_count;
2475
3416ab5b
JQ
2476 if (!migrate_use_compression()) {
2477 return;
2478 }
56e93d26
JQ
2479 thread_count = migrate_decompress_threads();
2480 decompress_threads = g_new0(QemuThread, thread_count);
2481 decomp_param = g_new0(DecompressParam, thread_count);
73a8912b
LL
2482 qemu_mutex_init(&decomp_done_lock);
2483 qemu_cond_init(&decomp_done_cond);
56e93d26
JQ
2484 for (i = 0; i < thread_count; i++) {
2485 qemu_mutex_init(&decomp_param[i].mutex);
2486 qemu_cond_init(&decomp_param[i].cond);
2487 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
73a8912b 2488 decomp_param[i].done = true;
90e56fb4 2489 decomp_param[i].quit = false;
56e93d26
JQ
2490 qemu_thread_create(decompress_threads + i, "decompress",
2491 do_data_decompress, decomp_param + i,
2492 QEMU_THREAD_JOINABLE);
2493 }
2494}
2495
f0afa331 2496static void compress_threads_load_cleanup(void)
56e93d26
JQ
2497{
2498 int i, thread_count;
2499
3416ab5b
JQ
2500 if (!migrate_use_compression()) {
2501 return;
2502 }
56e93d26
JQ
2503 thread_count = migrate_decompress_threads();
2504 for (i = 0; i < thread_count; i++) {
2505 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2506 decomp_param[i].quit = true;
56e93d26
JQ
2507 qemu_cond_signal(&decomp_param[i].cond);
2508 qemu_mutex_unlock(&decomp_param[i].mutex);
2509 }
2510 for (i = 0; i < thread_count; i++) {
2511 qemu_thread_join(decompress_threads + i);
2512 qemu_mutex_destroy(&decomp_param[i].mutex);
2513 qemu_cond_destroy(&decomp_param[i].cond);
2514 g_free(decomp_param[i].compbuf);
2515 }
2516 g_free(decompress_threads);
2517 g_free(decomp_param);
56e93d26
JQ
2518 decompress_threads = NULL;
2519 decomp_param = NULL;
56e93d26
JQ
2520}
2521
c1bc6626 2522static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
2523 void *host, int len)
2524{
2525 int idx, thread_count;
2526
2527 thread_count = migrate_decompress_threads();
73a8912b 2528 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
2529 while (true) {
2530 for (idx = 0; idx < thread_count; idx++) {
73a8912b 2531 if (decomp_param[idx].done) {
33d151f4
LL
2532 decomp_param[idx].done = false;
2533 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 2534 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
2535 decomp_param[idx].des = host;
2536 decomp_param[idx].len = len;
33d151f4
LL
2537 qemu_cond_signal(&decomp_param[idx].cond);
2538 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
2539 break;
2540 }
2541 }
2542 if (idx < thread_count) {
2543 break;
73a8912b
LL
2544 } else {
2545 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
2546 }
2547 }
73a8912b 2548 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
2549}
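
decompress_data_with_multi_threads() and do_data_decompress() form a small hand-rolled worker pool: the feeder looks for a slot whose done flag is set, hands it work under the per-worker mutex, and otherwise sleeps on the shared done condvar; each worker flips done back and signals that condvar when it finishes. The sketch below reproduces that handshake with plain pthreads and a dummy job instead of zlib's uncompress(); it is a simplified stand-in, not QEMU's DecompressParam code.

/* build with: cc -pthread pool_sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { NWORKERS = 2, NJOBS = 5 };

/* One slot per worker, mirroring DecompressParam. */
typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    bool done;      /* slot is idle; protected by done_lock */
    bool quit;
    int job;        /* stands in for (des, len); 0 means "no work" */
} Worker;

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;

static void *worker_fn(void *opaque)
{
    Worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->job) {
            int job = w->job;
            w->job = 0;
            pthread_mutex_unlock(&w->mutex);
            printf("worker handled job %d\n", job);  /* the "uncompress" */
            pthread_mutex_lock(&done_lock);
            w->done = true;                 /* tell the feeder we're idle */
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);
            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}

int main(void)
{
    Worker w[NWORKERS];
    pthread_t tid[NWORKERS];
    int i, job;

    for (i = 0; i < NWORKERS; i++) {
        pthread_mutex_init(&w[i].mutex, NULL);
        pthread_cond_init(&w[i].cond, NULL);
        w[i].done = true;
        w[i].quit = false;
        w[i].job = 0;
        pthread_create(&tid[i], NULL, worker_fn, &w[i]);
    }

    /* Feed jobs: grab an idle slot, hand over the work, else wait. */
    for (job = 1; job <= NJOBS; job++) {
        pthread_mutex_lock(&done_lock);
        for (;;) {
            for (i = 0; i < NWORKERS; i++) {
                if (w[i].done) {
                    w[i].done = false;
                    pthread_mutex_lock(&w[i].mutex);
                    w[i].job = job;
                    pthread_cond_signal(&w[i].cond);
                    pthread_mutex_unlock(&w[i].mutex);
                    break;
                }
            }
            if (i < NWORKERS) {
                break;
            }
            pthread_cond_wait(&done_cond, &done_lock);  /* all busy */
        }
        pthread_mutex_unlock(&done_lock);
    }

    /* Drain, then ask the workers to quit and join them,
     * roughly as compress_threads_load_cleanup() does. */
    pthread_mutex_lock(&done_lock);
    for (i = 0; i < NWORKERS; i++) {
        while (!w[i].done) {
            pthread_cond_wait(&done_cond, &done_lock);
        }
    }
    pthread_mutex_unlock(&done_lock);
    for (i = 0; i < NWORKERS; i++) {
        pthread_mutex_lock(&w[i].mutex);
        w[i].quit = true;
        pthread_cond_signal(&w[i].cond);
        pthread_mutex_unlock(&w[i].mutex);
        pthread_join(tid[i], NULL);
    }
    return 0;
}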
2550
f265e0e4
JQ
2551/**
2552 * ram_load_setup: Setup RAM for migration incoming side
2553 *
2554 * Returns zero to indicate success and negative for error
2555 *
2556 * @f: QEMUFile where to receive the data
2557 * @opaque: RAMState pointer
2558 */
2559static int ram_load_setup(QEMUFile *f, void *opaque)
2560{
2561 xbzrle_load_setup();
f0afa331 2562 compress_threads_load_setup();
f265e0e4
JQ
2563 return 0;
2564}
2565
2566static int ram_load_cleanup(void *opaque)
2567{
2568 xbzrle_load_cleanup();
f0afa331 2569 compress_threads_load_cleanup();
f265e0e4
JQ
2570 return 0;
2571}
2572
3d0684b2
JQ
2573/**
2574 * ram_postcopy_incoming_init: allocate postcopy data structures
2575 *
2576 * Returns 0 for success and negative if there was one error
2577 *
2578 * @mis: current migration incoming state
2579 *
2580 * Allocate data structures etc needed by incoming migration with
2581	 * postcopy-ram. postcopy-ram's similarly named
2582 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
2583 */
2584int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2585{
b8c48993 2586 unsigned long ram_pages = last_ram_page();
1caddf8a
DDAG
2587
2588 return postcopy_ram_incoming_init(mis, ram_pages);
2589}
2590
3d0684b2
JQ
2591/**
2592 * ram_load_postcopy: load a page in postcopy case
2593 *
2594 * Returns 0 for success or -errno in case of error
2595 *
a7180877
DDAG
2596 * Called in postcopy mode by ram_load().
2597 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
2598 *
2599 * @f: QEMUFile where to send the data
a7180877
DDAG
2600 */
2601static int ram_load_postcopy(QEMUFile *f)
2602{
2603 int flags = 0, ret = 0;
2604 bool place_needed = false;
28abd200 2605 bool matching_page_sizes = false;
a7180877
DDAG
2606 MigrationIncomingState *mis = migration_incoming_get_current();
2607 /* Temporary page that is later 'placed' */
2608 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 2609 void *last_host = NULL;
a3b6ff6d 2610 bool all_zero = false;
a7180877
DDAG
2611
2612 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2613 ram_addr_t addr;
2614 void *host = NULL;
2615 void *page_buffer = NULL;
2616 void *place_source = NULL;
df9ff5e1 2617 RAMBlock *block = NULL;
a7180877 2618 uint8_t ch;
a7180877
DDAG
2619
2620 addr = qemu_get_be64(f);
2621 flags = addr & ~TARGET_PAGE_MASK;
2622 addr &= TARGET_PAGE_MASK;
2623
2624 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2625 place_needed = false;
bb890ed5 2626 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 2627 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
2628
2629 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
2630 if (!host) {
2631 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2632 ret = -EINVAL;
2633 break;
2634 }
28abd200 2635 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 2636 /*
28abd200
DDAG
2637 * Postcopy requires that we place whole host pages atomically;
2638 * these may be huge pages for RAMBlocks that are backed by
2639 * hugetlbfs.
a7180877
DDAG
2640 * To make it atomic, the data is read into a temporary page
2641 * that's moved into place later.
2642 * The migration protocol uses, possibly smaller, target-pages
2643 * however the source ensures it always sends all the components
2644 * of a host page in order.
2645 */
2646 page_buffer = postcopy_host_page +
28abd200 2647 ((uintptr_t)host & (block->page_size - 1));
a7180877 2648 /* If all TP are zero then we can optimise the place */
28abd200 2649 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 2650 all_zero = true;
c53b7ddc
DDAG
2651 } else {
2652 /* not the 1st TP within the HP */
2653 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 2654 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
2655 host, last_host);
2656 ret = -EINVAL;
2657 break;
2658 }
a7180877
DDAG
2659 }
2660
c53b7ddc 2661
a7180877
DDAG
2662 /*
2663 * If it's the last part of a host page then we place the host
2664 * page
2665 */
2666 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 2667 (block->page_size - 1)) == 0;
a7180877
DDAG
2668 place_source = postcopy_host_page;
2669 }
c53b7ddc 2670 last_host = host;
a7180877
DDAG
2671
2672 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 2673 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
2674 ch = qemu_get_byte(f);
2675 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2676 if (ch) {
2677 all_zero = false;
2678 }
2679 break;
2680
2681 case RAM_SAVE_FLAG_PAGE:
2682 all_zero = false;
2683 if (!place_needed || !matching_page_sizes) {
2684 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2685 } else {
2686 /* Avoids the qemu_file copy during postcopy, which is
2687 * going to do a copy later; can only do it when we
2688 * do this read in one go (matching page sizes)
2689 */
2690 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2691 TARGET_PAGE_SIZE);
2692 }
2693 break;
2694 case RAM_SAVE_FLAG_EOS:
2695 /* normal exit */
2696 break;
2697 default:
2698 error_report("Unknown combination of migration flags: %#x"
2699 " (postcopy mode)", flags);
2700 ret = -EINVAL;
2701 }
2702
2703 if (place_needed) {
2704 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
2705 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2706
a7180877 2707 if (all_zero) {
df9ff5e1
DDAG
2708 ret = postcopy_place_page_zero(mis, place_dest,
2709 block->page_size);
a7180877 2710 } else {
df9ff5e1
DDAG
2711 ret = postcopy_place_page(mis, place_dest,
2712 place_source, block->page_size);
a7180877
DDAG
2713 }
2714 }
2715 if (!ret) {
2716 ret = qemu_file_get_error(f);
2717 }
2718 }
2719
2720 return ret;
2721}
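
The postcopy load path assembles whole host pages out of target-page-sized reads: each target page is copied to its offset inside a temporary host page, and the host page is only "placed" once its last target page has arrived. A quick standalone check of the offset and placement arithmetic used above, with 4K target pages and a 2M host page hard-coded purely for illustration.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative page sizes: 4K target pages assembled into a 2M host page. */
#define TP_SIZE 4096u
#define HP_SIZE (2u * 1024 * 1024)

int main(void)
{
    uint64_t host_page = 0x7f0000000000ull;  /* hypothetical, 2M-aligned */
    unsigned int placed = 0;

    for (unsigned int i = 0; i < HP_SIZE / TP_SIZE; i++) {
        uint64_t host = host_page + (uint64_t)i * TP_SIZE;

        /* Offset of this target page inside the temporary host page,
         * the same arithmetic as page_buffer above. */
        uint64_t offset_in_hp = host & (HP_SIZE - 1);

        bool first_tp = (offset_in_hp == 0);
        bool last_tp = ((host + TP_SIZE) & (HP_SIZE - 1)) == 0;

        /* The first target page lands at offset 0 in the buffer; the last
         * one triggers the placement of the whole host page. */
        if (i == 0) {
            assert(first_tp);
        }
        if (last_tp) {
            placed++;
        }
    }
    assert(placed == 1);                     /* exactly one placement per host page */
    printf("placed %u host page(s)\n", placed);
    return 0;
}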
2722
56e93d26
JQ
2723static int ram_load(QEMUFile *f, void *opaque, int version_id)
2724{
edc60127 2725 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
2726 static uint64_t seq_iter;
2727 int len = 0;
a7180877
DDAG
2728 /*
2729 * If system is running in postcopy mode, page inserts to host memory must
2730 * be atomic
2731 */
2732 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
ef08fb38
DDAG
2733 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2734 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
56e93d26
JQ
2735
2736 seq_iter++;
2737
2738 if (version_id != 4) {
2739 ret = -EINVAL;
2740 }
2741
edc60127
JQ
2742 if (!migrate_use_compression()) {
2743 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2744 }
56e93d26
JQ
2745 /* This RCU critical section can be very long running.
2746 * When RCU reclaims in the code start to become numerous,
2747 * it will be necessary to reduce the granularity of this
2748 * critical section.
2749 */
2750 rcu_read_lock();
a7180877
DDAG
2751
2752 if (postcopy_running) {
2753 ret = ram_load_postcopy(f);
2754 }
2755
2756 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 2757 ram_addr_t addr, total_ram_bytes;
a776aa15 2758 void *host = NULL;
56e93d26
JQ
2759 uint8_t ch;
2760
2761 addr = qemu_get_be64(f);
2762 flags = addr & ~TARGET_PAGE_MASK;
2763 addr &= TARGET_PAGE_MASK;
2764
edc60127
JQ
2765 if (flags & invalid_flags) {
2766 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2767 error_report("Received an unexpected compressed page");
2768 }
2769
2770 ret = -EINVAL;
2771 break;
2772 }
2773
bb890ed5 2774 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 2775 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
2776 RAMBlock *block = ram_block_from_stream(f, flags);
2777
2778 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
2779 if (!host) {
2780 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2781 ret = -EINVAL;
2782 break;
2783 }
1db9d8e5 2784 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
2785 }
2786
56e93d26
JQ
2787 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2788 case RAM_SAVE_FLAG_MEM_SIZE:
2789 /* Synchronize RAM block list */
2790 total_ram_bytes = addr;
2791 while (!ret && total_ram_bytes) {
2792 RAMBlock *block;
56e93d26
JQ
2793 char id[256];
2794 ram_addr_t length;
2795
2796 len = qemu_get_byte(f);
2797 qemu_get_buffer(f, (uint8_t *)id, len);
2798 id[len] = 0;
2799 length = qemu_get_be64(f);
2800
e3dd7493
DDAG
2801 block = qemu_ram_block_by_name(id);
2802 if (block) {
2803 if (length != block->used_length) {
2804 Error *local_err = NULL;
56e93d26 2805
fa53a0e5 2806 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
2807 &local_err);
2808 if (local_err) {
2809 error_report_err(local_err);
56e93d26 2810 }
56e93d26 2811 }
ef08fb38
DDAG
2812 /* For postcopy we need to check hugepage sizes match */
2813 if (postcopy_advised &&
2814 block->page_size != qemu_host_page_size) {
2815 uint64_t remote_page_size = qemu_get_be64(f);
2816 if (remote_page_size != block->page_size) {
2817 error_report("Mismatched RAM page size %s "
2818 "(local) %zd != %" PRId64,
2819 id, block->page_size,
2820 remote_page_size);
2821 ret = -EINVAL;
2822 }
2823 }
e3dd7493
DDAG
2824 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2825 block->idstr);
2826 } else {
56e93d26
JQ
2827 error_report("Unknown ramblock \"%s\", cannot "
2828 "accept migration", id);
2829 ret = -EINVAL;
2830 }
2831
2832 total_ram_bytes -= length;
2833 }
2834 break;
a776aa15 2835
bb890ed5 2836 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
2837 ch = qemu_get_byte(f);
2838 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2839 break;
a776aa15 2840
56e93d26 2841 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
2842 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2843 break;
56e93d26 2844
a776aa15 2845 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
2846 len = qemu_get_be32(f);
2847 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2848 error_report("Invalid compressed data length: %d", len);
2849 ret = -EINVAL;
2850 break;
2851 }
c1bc6626 2852 decompress_data_with_multi_threads(f, host, len);
56e93d26 2853 break;
a776aa15 2854
56e93d26 2855 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
2856 if (load_xbzrle(f, addr, host) < 0) {
2857 error_report("Failed to decompress XBZRLE page at "
2858 RAM_ADDR_FMT, addr);
2859 ret = -EINVAL;
2860 break;
2861 }
2862 break;
2863 case RAM_SAVE_FLAG_EOS:
2864 /* normal exit */
2865 break;
2866 default:
2867 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 2868 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
2869 } else {
2870 error_report("Unknown combination of migration flags: %#x",
2871 flags);
2872 ret = -EINVAL;
2873 }
2874 }
2875 if (!ret) {
2876 ret = qemu_file_get_error(f);
2877 }
2878 }
2879
5533b2e9 2880 wait_for_decompress_done();
56e93d26 2881 rcu_read_unlock();
55c4446b 2882 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
2883 return ret;
2884}
2885
c6467627
VSO
2886static bool ram_has_postcopy(void *opaque)
2887{
2888 return migrate_postcopy_ram();
2889}
2890
56e93d26 2891static SaveVMHandlers savevm_ram_handlers = {
9907e842 2892 .save_setup = ram_save_setup,
56e93d26 2893 .save_live_iterate = ram_save_iterate,
763c906b 2894 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 2895 .save_live_complete_precopy = ram_save_complete,
c6467627 2896 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
2897 .save_live_pending = ram_save_pending,
2898 .load_state = ram_load,
f265e0e4
JQ
2899 .save_cleanup = ram_save_cleanup,
2900 .load_setup = ram_load_setup,
2901 .load_cleanup = ram_load_cleanup,
56e93d26
JQ
2902};
2903
2904void ram_mig_init(void)
2905{
2906 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 2907 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 2908}