1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
60
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64
65 /***********************************************************/
66 /* ram save/restore */
67
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
69  * worked for pages that were filled with the same char. We switched
70  * it to only search for the zero value, and renamed it to avoid
71  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
72 */
73
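/*
 * These flags are OR'ed into the low bits of the page offset that
 * save_page_header() puts on the wire; since offsets are target-page
 * aligned, the low bits are free to carry them.
 */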
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
85 {
86 return buffer_is_zero(p, size);
87 }
88
89 XBZRLECacheStats xbzrle_counters;
90
91 /* This struct contains the XBZRLE cache and a static page
92 used by the compression */
93 static struct {
94 /* buffer used for XBZRLE encoding */
95 uint8_t *encoded_buf;
96 /* buffer for storing page content */
97 uint8_t *current_buf;
98 /* Cache for XBZRLE, Protected by lock. */
99 PageCache *cache;
100 QemuMutex lock;
101 /* it will store a page full of zeros */
102 uint8_t *zero_target_page;
103 /* buffer used for XBZRLE decoding */
104 uint8_t *decoded_buf;
105 } XBZRLE;
106
107 static void XBZRLE_cache_lock(void)
108 {
109 if (migrate_use_xbzrle()) {
110 qemu_mutex_lock(&XBZRLE.lock);
111 }
112 }
113
114 static void XBZRLE_cache_unlock(void)
115 {
116 if (migrate_use_xbzrle()) {
117 qemu_mutex_unlock(&XBZRLE.lock);
118 }
119 }
120
121 /**
122 * xbzrle_cache_resize: resize the xbzrle cache
123 *
124 * This function is called from migrate_params_apply in the main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock.
128 *
129 * Returns 0 for success or -1 for error
130 *
131 * @new_size: new cache size
132 * @errp: set to the failure reason if the check fails
133 */
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
135 {
136 PageCache *new_cache;
137 int64_t ret = 0;
138
139 /* Check for truncation */
140 if (new_size != (size_t)new_size) {
141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142 "exceeding address space");
143 return -1;
144 }
145
146 if (new_size == migrate_xbzrle_cache_size()) {
147 /* nothing to do */
148 return 0;
149 }
150
151 XBZRLE_cache_lock();
152
153 if (XBZRLE.cache != NULL) {
154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
155 if (!new_cache) {
156 ret = -1;
157 goto out;
158 }
159
160 cache_fini(XBZRLE.cache);
161 XBZRLE.cache = new_cache;
162 }
163 out:
164 XBZRLE_cache_unlock();
165 return ret;
166 }
167
168 bool ramblock_is_ignored(RAMBlock *block)
169 {
170 return !qemu_ram_is_migratable(block) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 }
173
174 #undef RAMBLOCK_FOREACH
175
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177 {
178 RAMBlock *block;
179 int ret = 0;
180
181 RCU_READ_LOCK_GUARD();
182
183 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184 ret = func(block, opaque);
185 if (ret) {
186 break;
187 }
188 }
189 return ret;
190 }
191
192 static void ramblock_recv_map_init(void)
193 {
194 RAMBlock *rb;
195
196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
197 assert(!rb->receivedmap);
198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
199 }
200 }
201
202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
203 {
204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
205 rb->receivedmap);
206 }
207
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
209 {
210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 }
212
213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
214 {
215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 }
217
218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
219 size_t nr)
220 {
221 bitmap_set_atomic(rb->receivedmap,
222 ramblock_recv_bitmap_offset(host_addr, rb),
223 nr);
224 }
225
226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227
228 /*
229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
230 *
231 * Returns >0 if success with sent bytes, or <0 if error.
232 */
233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234 const char *block_name)
235 {
236 RAMBlock *block = qemu_ram_block_by_name(block_name);
237 unsigned long *le_bitmap, nbits;
238 uint64_t size;
239
240 if (!block) {
241 error_report("%s: invalid block name: %s", __func__, block_name);
242 return -1;
243 }
244
245 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
246
247 /*
248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249 * machines we may need 4 more bytes for padding (see below
250 * comment). So extend it a bit beforehand.
251 */
252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253
254 /*
255 * Always use little endian when sending the bitmap. This is
256 * required so that it can be parsed even when source and destination
257 * VMs are not using the same endianness. (Note: big endian won't work.)
258 */
259 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
260
261 /* Size of the bitmap, in bytes */
262 size = DIV_ROUND_UP(nbits, 8);
263
264 /*
265 * size is always aligned to 8 bytes for 64bit machines, but that
266 * may not be true for 32bit machines. We need this padding to
267 * make sure the migration can survive even between 32bit and
268 * 64bit machines.
269 */
270 size = ROUND_UP(size, 8);
271
272 qemu_put_be64(file, size);
273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
274 /*
275 * Mark as an end, in case the middle part is screwed up due to
276 * some "mysterious" reason.
277 */
278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
279 qemu_fflush(file);
280
281 g_free(le_bitmap);
282
283 if (qemu_file_get_error(file)) {
284 return qemu_file_get_error(file);
285 }
286
287 return size + sizeof(size);
288 }
289
290 /*
291 * An outstanding page request, on the source, having been received
292 * and queued
293 */
294 struct RAMSrcPageRequest {
295 RAMBlock *rb;
296 hwaddr offset;
297 hwaddr len;
298
299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 };
301
302 /* State of RAM for migration */
303 struct RAMState {
304 /* QEMUFile used for this migration */
305 QEMUFile *f;
306 /* UFFD file descriptor, used in 'write-tracking' migration */
307 int uffdio_fd;
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
314 /* last ram version we have seen */
315 uint32_t last_version;
317 /* How many times we have dirtied too many pages */
317 int dirty_rate_high_cnt;
318 /* these variables are used for bitmap sync */
319 /* last time we did a full bitmap_sync */
320 int64_t time_last_bitmap_sync;
321 /* bytes transferred at start_time */
322 uint64_t bytes_xfer_prev;
323 /* number of dirty pages since start_time */
324 uint64_t num_dirty_pages_period;
325 /* xbzrle misses since the beginning of the period */
326 uint64_t xbzrle_cache_miss_prev;
327 /* Amount of xbzrle pages since the beginning of the period */
328 uint64_t xbzrle_pages_prev;
329 /* Amount of xbzrle encoded bytes since the beginning of the period */
330 uint64_t xbzrle_bytes_prev;
331 /* Start using XBZRLE (e.g., after the first round). */
332 bool xbzrle_enabled;
333
334 /* compression statistics since the beginning of the period */
335 /* number of times there was no free thread to compress data */
336 uint64_t compress_thread_busy_prev;
337 /* number of bytes after compression */
338 uint64_t compressed_size_prev;
339 /* number of compressed pages */
340 uint64_t compress_pages_prev;
341
342 /* total handled target pages at the beginning of period */
343 uint64_t target_page_count_prev;
344 /* total handled target pages since start */
345 uint64_t target_page_count;
346 /* number of dirty bits in the bitmap */
347 uint64_t migration_dirty_pages;
348 /* Protects modification of the bitmap and migration dirty pages */
349 QemuMutex bitmap_mutex;
350 /* The RAMBlock used in the last src_page_requests */
351 RAMBlock *last_req_rb;
352 /* Queue of outstanding page requests from the destination */
353 QemuMutex src_page_req_mutex;
354 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
355 };
356 typedef struct RAMState RAMState;
357
358 static RAMState *ram_state;
359
360 static NotifierWithReturnList precopy_notifier_list;
361
362 void precopy_infrastructure_init(void)
363 {
364 notifier_with_return_list_init(&precopy_notifier_list);
365 }
366
367 void precopy_add_notifier(NotifierWithReturn *n)
368 {
369 notifier_with_return_list_add(&precopy_notifier_list, n);
370 }
371
372 void precopy_remove_notifier(NotifierWithReturn *n)
373 {
374 notifier_with_return_remove(n);
375 }
376
377 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
378 {
379 PrecopyNotifyData pnd;
380 pnd.reason = reason;
381 pnd.errp = errp;
382
383 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 }
385
386 uint64_t ram_bytes_remaining(void)
387 {
388 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
389 0;
390 }
391
392 MigrationStats ram_counters;
393
394 /* used by the search for pages to send */
395 struct PageSearchStatus {
396 /* Current block being searched */
397 RAMBlock *block;
398 /* Current page to search from */
399 unsigned long page;
400 /* Set once we wrap around */
401 bool complete_round;
402 };
403 typedef struct PageSearchStatus PageSearchStatus;
404
405 CompressionStats compression_counters;
406
407 struct CompressParam {
408 bool done;
409 bool quit;
410 bool zero_page;
411 QEMUFile *file;
412 QemuMutex mutex;
413 QemuCond cond;
414 RAMBlock *block;
415 ram_addr_t offset;
416
417 /* internally used fields */
418 z_stream stream;
419 uint8_t *originbuf;
420 };
421 typedef struct CompressParam CompressParam;
422
423 struct DecompressParam {
424 bool done;
425 bool quit;
426 QemuMutex mutex;
427 QemuCond cond;
428 void *des;
429 uint8_t *compbuf;
430 int len;
431 z_stream stream;
432 };
433 typedef struct DecompressParam DecompressParam;
434
435 static CompressParam *comp_param;
436 static QemuThread *compress_threads;
437 /* comp_done_cond is used to wake up the migration thread when
438 * one of the compression threads has finished the compression.
439 * comp_done_lock is used together with comp_done_cond.
440 */
441 static QemuMutex comp_done_lock;
442 static QemuCond comp_done_cond;
443 /* The empty QEMUFileOps is used by the file member of CompressParam */
444 static const QEMUFileOps empty_ops = { };
445
446 static QEMUFile *decomp_file;
447 static DecompressParam *decomp_param;
448 static QemuThread *decompress_threads;
449 static QemuMutex decomp_done_lock;
450 static QemuCond decomp_done_cond;
451
452 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
453 ram_addr_t offset, uint8_t *source_buf);
454
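/*
 * Compression worker thread: wait for a block/offset pair to be posted
 * in our CompressParam, compress that page into param->file via
 * do_compress_ram_page(), then mark ourselves done and signal
 * comp_done_cond so the migration thread can collect the result.
 */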
455 static void *do_data_compress(void *opaque)
456 {
457 CompressParam *param = opaque;
458 RAMBlock *block;
459 ram_addr_t offset;
460 bool zero_page;
461
462 qemu_mutex_lock(&param->mutex);
463 while (!param->quit) {
464 if (param->block) {
465 block = param->block;
466 offset = param->offset;
467 param->block = NULL;
468 qemu_mutex_unlock(&param->mutex);
469
470 zero_page = do_compress_ram_page(param->file, &param->stream,
471 block, offset, param->originbuf);
472
473 qemu_mutex_lock(&comp_done_lock);
474 param->done = true;
475 param->zero_page = zero_page;
476 qemu_cond_signal(&comp_done_cond);
477 qemu_mutex_unlock(&comp_done_lock);
478
479 qemu_mutex_lock(&param->mutex);
480 } else {
481 qemu_cond_wait(&param->cond, &param->mutex);
482 }
483 }
484 qemu_mutex_unlock(&param->mutex);
485
486 return NULL;
487 }
488
489 static void compress_threads_save_cleanup(void)
490 {
491 int i, thread_count;
492
493 if (!migrate_use_compression() || !comp_param) {
494 return;
495 }
496
497 thread_count = migrate_compress_threads();
498 for (i = 0; i < thread_count; i++) {
499 /*
500 * we use it as an indicator of whether the thread is
501 * properly initialized or not
502 */
503 if (!comp_param[i].file) {
504 break;
505 }
506
507 qemu_mutex_lock(&comp_param[i].mutex);
508 comp_param[i].quit = true;
509 qemu_cond_signal(&comp_param[i].cond);
510 qemu_mutex_unlock(&comp_param[i].mutex);
511
512 qemu_thread_join(compress_threads + i);
513 qemu_mutex_destroy(&comp_param[i].mutex);
514 qemu_cond_destroy(&comp_param[i].cond);
515 deflateEnd(&comp_param[i].stream);
516 g_free(comp_param[i].originbuf);
517 qemu_fclose(comp_param[i].file);
518 comp_param[i].file = NULL;
519 }
520 qemu_mutex_destroy(&comp_done_lock);
521 qemu_cond_destroy(&comp_done_cond);
522 g_free(compress_threads);
523 g_free(comp_param);
524 compress_threads = NULL;
525 comp_param = NULL;
526 }
527
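/*
 * Create one worker per migrate_compress_threads(), each with its own
 * page buffer, zlib stream and dummy QEMUFile. Returns 0 on success,
 * or -1 after cleaning up any partially initialized state.
 */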
528 static int compress_threads_save_setup(void)
529 {
530 int i, thread_count;
531
532 if (!migrate_use_compression()) {
533 return 0;
534 }
535 thread_count = migrate_compress_threads();
536 compress_threads = g_new0(QemuThread, thread_count);
537 comp_param = g_new0(CompressParam, thread_count);
538 qemu_cond_init(&comp_done_cond);
539 qemu_mutex_init(&comp_done_lock);
540 for (i = 0; i < thread_count; i++) {
541 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
542 if (!comp_param[i].originbuf) {
543 goto exit;
544 }
545
546 if (deflateInit(&comp_param[i].stream,
547 migrate_compress_level()) != Z_OK) {
548 g_free(comp_param[i].originbuf);
549 goto exit;
550 }
551
552 /* comp_param[i].file is just used as a dummy buffer to save data,
553 * so set its ops to empty.
554 */
555 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
556 comp_param[i].done = true;
557 comp_param[i].quit = false;
558 qemu_mutex_init(&comp_param[i].mutex);
559 qemu_cond_init(&comp_param[i].cond);
560 qemu_thread_create(compress_threads + i, "compress",
561 do_data_compress, comp_param + i,
562 QEMU_THREAD_JOINABLE);
563 }
564 return 0;
565
566 exit:
567 compress_threads_save_cleanup();
568 return -1;
569 }
570
571 /**
572 * save_page_header: write page header to wire
573 *
574 * If the page is in a different block than the last one sent, it also writes the block identification
575 *
576 * Returns the number of bytes written
577 *
578 * @f: QEMUFile where to send the data
579 * @block: block that contains the page we want to send
580 * @offset: offset inside the block for the page;
581 * the lower bits contain flags
582 */
583 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
584 ram_addr_t offset)
585 {
586 size_t size, len;
587
588 if (block == rs->last_sent_block) {
589 offset |= RAM_SAVE_FLAG_CONTINUE;
590 }
591 qemu_put_be64(f, offset);
592 size = 8;
593
594 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
595 len = strlen(block->idstr);
596 qemu_put_byte(f, len);
597 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
598 size += 1 + len;
599 rs->last_sent_block = block;
600 }
601 return size;
602 }
603
604 /**
605 * mig_throttle_guest_down: throttle down the guest
606 *
607 * Reduce amount of guest cpu execution to hopefully slow down memory
608 * writes. If guest dirty memory rate is reduced below the rate at
609 * which we can transfer pages to the destination then we should be
610 * able to complete migration. Some workloads dirty memory way too
611 * fast and will not effectively converge, even with auto-converge.
612 */
613 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
614 uint64_t bytes_dirty_threshold)
615 {
616 MigrationState *s = migrate_get_current();
617 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
618 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
619 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
620 int pct_max = s->parameters.max_cpu_throttle;
621
622 uint64_t throttle_now = cpu_throttle_get_percentage();
623 uint64_t cpu_now, cpu_ideal, throttle_inc;
624
625 /* We have not started throttling yet. Let's start it. */
626 if (!cpu_throttle_active()) {
627 cpu_throttle_set(pct_initial);
628 } else {
629 /* Throttling already on, just increase the rate */
630 if (!pct_tailslow) {
631 throttle_inc = pct_increment;
632 } else {
633 /* Compute the ideal CPU percentage used by Guest, which may
634 * make the dirty rate match the dirty rate threshold. */
635 cpu_now = 100 - throttle_now;
636 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
637 bytes_dirty_period);
638 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
639 }
640 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
641 }
642 }
643
644 /**
645 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
646 *
647 * @rs: current RAM state
648 * @current_addr: address for the zero page
649 *
650 * Update the xbzrle cache to reflect a page that's been sent as all 0.
651 * The important thing is that a stale (not-yet-0'd) page be replaced
652 * by the new data.
653 * As a bonus, if the page wasn't in the cache it gets added so that
654 * when a small write is made into the 0'd page it gets XBZRLE sent.
655 */
656 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
657 {
658 if (!rs->xbzrle_enabled) {
659 return;
660 }
661
662 /* We don't care if this fails to allocate a new cache page
663 * as long as it updated an old one */
664 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
665 ram_counters.dirty_sync_count);
666 }
667
668 #define ENCODING_FLAG_XBZRLE 0x1
669
670 /**
671 * save_xbzrle_page: compress and send current page
672 *
673 * Returns: 1 means that we wrote the page
674 * 0 means that page is identical to the one already sent
675 * -1 means that xbzrle would be longer than normal
676 *
677 * @rs: current RAM state
678 * @current_data: pointer to the address of the page contents
679 * @current_addr: addr of the page
680 * @block: block that contains the page we want to send
681 * @offset: offset inside the block for the page
682 * @last_stage: if we are at the completion stage
683 */
684 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
685 ram_addr_t current_addr, RAMBlock *block,
686 ram_addr_t offset, bool last_stage)
687 {
688 int encoded_len = 0, bytes_xbzrle;
689 uint8_t *prev_cached_page;
690
691 if (!cache_is_cached(XBZRLE.cache, current_addr,
692 ram_counters.dirty_sync_count)) {
693 xbzrle_counters.cache_miss++;
694 if (!last_stage) {
695 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
696 ram_counters.dirty_sync_count) == -1) {
697 return -1;
698 } else {
699 /* update *current_data when the page has been
700 inserted into cache */
701 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 }
703 }
704 return -1;
705 }
706
707 /*
708 * Reaching here means the page has hit the xbzrle cache, no matter what
709 * encoding result it is (normal encoding, overflow or skipping the page),
710 * count the page as encoded. This is used to calculate the encoding rate.
711 *
712 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
713 * 2nd page turns out to be skipped (i.e. no new bytes written to the
714 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
715 * skipped page included. In this way, the encoding rate can tell if the
716 * guest page is good for xbzrle encoding.
717 */
718 xbzrle_counters.pages++;
719 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
720
721 /* save current buffer into memory */
722 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
723
724 /* XBZRLE encoding (if there is no overflow) */
725 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
726 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
727 TARGET_PAGE_SIZE);
728
729 /*
730 * Update the cache contents, so that it corresponds to the data
731 * sent, in all cases except where we skip the page.
732 */
733 if (!last_stage && encoded_len != 0) {
734 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
735 /*
736 * In the case where we couldn't compress, ensure that the caller
737 * sends the data from the cache, since the guest might have
738 * changed the RAM since we copied it.
739 */
740 *current_data = prev_cached_page;
741 }
742
743 if (encoded_len == 0) {
744 trace_save_xbzrle_page_skipping();
745 return 0;
746 } else if (encoded_len == -1) {
747 trace_save_xbzrle_page_overflow();
748 xbzrle_counters.overflow++;
749 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
750 return -1;
751 }
752
753 /* Send XBZRLE based compressed page */
754 bytes_xbzrle = save_page_header(rs, rs->f, block,
755 offset | RAM_SAVE_FLAG_XBZRLE);
756 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
757 qemu_put_be16(rs->f, encoded_len);
758 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
759 bytes_xbzrle += encoded_len + 1 + 2;
760 /*
761 * Like compressed_size (please see update_compress_thread_counts),
762 * the xbzrle encoded bytes don't count the 8 byte header with
763 * RAM_SAVE_FLAG_CONTINUE.
764 */
765 xbzrle_counters.bytes += bytes_xbzrle - 8;
766 ram_counters.transferred += bytes_xbzrle;
767
768 return 1;
769 }
770
771 /**
772 * migration_bitmap_find_dirty: find the next dirty page from start
773 *
774 * Returns the page offset within memory region of the start of a dirty page
775 *
776 * @rs: current RAM state
777 * @rb: RAMBlock where to search for dirty pages
778 * @start: page where we start the search
779 */
780 static inline
781 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
782 unsigned long start)
783 {
784 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
785 unsigned long *bitmap = rb->bmap;
786
787 if (ramblock_is_ignored(rb)) {
788 return size;
789 }
790
791 return find_next_bit(bitmap, size, start);
792 }
793
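/*
 * Lazily clear the underlying dirty log (e.g. in KVM) for the
 * clear_bmap chunk containing @page, but only if that chunk has not
 * been cleared yet.
 */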
794 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
795 unsigned long page)
796 {
797 uint8_t shift;
798 hwaddr size, start;
799
800 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
801 return;
802 }
803
804 shift = rb->clear_bmap_shift;
805 /*
806 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. This
807 * can make things easier sometimes since the start address
808 * of the small chunk will always be aligned to 64 pages, so the
809 * bitmap will always be aligned to unsigned long. We should
810 * even be able to remove this restriction but I'm simply
811 * keeping it.
812 */
813 assert(shift >= 6);
814
815 size = 1ULL << (TARGET_PAGE_BITS + shift);
816 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
817 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
818 memory_region_clear_dirty_bitmap(rb->mr, start, size);
819 }
820
821 static void
822 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
823 unsigned long start,
824 unsigned long npages)
825 {
826 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
827 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
828 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
829
830 /*
831 * Clear pages from start to start + npages - 1, so the end boundary is
832 * exclusive.
833 */
834 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
835 migration_clear_memory_region_dirty_bitmap(rb, i);
836 }
837 }
838
839 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
840 RAMBlock *rb,
841 unsigned long page)
842 {
843 bool ret;
844
845 /*
846 * Clear the dirty bitmap if needed. This _must_ be called before we
847 * send any page in the chunk, because we need to make sure
848 * we can capture further page content changes when we sync the dirty
849 * log the next time. So as long as we are going to send any
850 * page in the chunk, we clear the remote dirty bitmap for all of it.
851 * Clearing it earlier won't be a problem, but clearing it too late will.
852 */
853 migration_clear_memory_region_dirty_bitmap(rb, page);
854
855 ret = test_and_clear_bit(page, rb->bmap);
856 if (ret) {
857 rs->migration_dirty_pages--;
858 }
859
860 return ret;
861 }
862
863 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
864 void *opaque)
865 {
866 const hwaddr offset = section->offset_within_region;
867 const hwaddr size = int128_get64(section->size);
868 const unsigned long start = offset >> TARGET_PAGE_BITS;
869 const unsigned long npages = size >> TARGET_PAGE_BITS;
870 RAMBlock *rb = section->mr->ram_block;
871 uint64_t *cleared_bits = opaque;
872
873 /*
874 * We don't grab ram_state->bitmap_mutex because we expect to run
875 * only when starting migration or during postcopy recovery where
876 * we don't have concurrent access.
877 */
878 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
879 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
880 }
881 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
882 bitmap_clear(rb->bmap, start, npages);
883 }
884
885 /*
886 * Exclude all dirty pages from migration that fall into a discarded range as
887 * managed by a RamDiscardManager responsible for the mapped memory region of
888 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
889 *
890 * Discarded pages ("logically unplugged") have undefined content and must
891 * not get migrated, because even reading these pages for migration might
892 * result in undesired behavior.
893 *
894 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
895 *
896 * Note: The result is only stable while migrating (precopy/postcopy).
897 */
898 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
899 {
900 uint64_t cleared_bits = 0;
901
902 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
903 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
904 MemoryRegionSection section = {
905 .mr = rb->mr,
906 .offset_within_region = 0,
907 .size = int128_make64(qemu_ram_get_used_length(rb)),
908 };
909
910 ram_discard_manager_replay_discarded(rdm, &section,
911 dirty_bitmap_clear_section,
912 &cleared_bits);
913 }
914 return cleared_bits;
915 }
916
917 /*
918 * Check if a host-page aligned page falls into a discarded range as managed by
919 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
920 *
921 * Note: The result is only stable while migrating (precopy/postcopy).
922 */
923 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
924 {
925 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
926 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
927 MemoryRegionSection section = {
928 .mr = rb->mr,
929 .offset_within_region = start,
930 .size = int128_make64(qemu_ram_pagesize(rb)),
931 };
932
933 return !ram_discard_manager_is_populated(rdm, &section);
934 }
935 return false;
936 }
937
938 /* Called with RCU critical section */
939 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
940 {
941 uint64_t new_dirty_pages =
942 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
943
944 rs->migration_dirty_pages += new_dirty_pages;
945 rs->num_dirty_pages_period += new_dirty_pages;
946 }
947
948 /**
949 * ram_pagesize_summary: calculate all the pagesizes of a VM
950 *
951 * Returns a summary bitmap of the page sizes of all RAMBlocks
952 *
953 * For VMs with just normal pages this is equivalent to the host page
954 * size. If the VM has some huge pages then it's the OR of all the
955 * different page sizes.
956 */
957 uint64_t ram_pagesize_summary(void)
958 {
959 RAMBlock *block;
960 uint64_t summary = 0;
961
962 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
963 summary |= block->page_size;
964 }
965
966 return summary;
967 }
968
969 uint64_t ram_get_total_transferred_pages(void)
970 {
971 return ram_counters.normal + ram_counters.duplicate +
972 compression_counters.pages + xbzrle_counters.pages;
973 }
974
975 static void migration_update_rates(RAMState *rs, int64_t end_time)
976 {
977 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
978 double compressed_size;
979
980 /* calculate period counters */
981 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
982 / (end_time - rs->time_last_bitmap_sync);
983
984 if (!page_count) {
985 return;
986 }
987
988 if (migrate_use_xbzrle()) {
989 double encoded_size, unencoded_size;
990
991 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
992 rs->xbzrle_cache_miss_prev) / page_count;
993 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
994 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
995 TARGET_PAGE_SIZE;
996 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
997 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
998 xbzrle_counters.encoding_rate = 0;
999 } else {
1000 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1001 }
1002 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1003 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1004 }
1005
1006 if (migrate_use_compression()) {
1007 compression_counters.busy_rate = (double)(compression_counters.busy -
1008 rs->compress_thread_busy_prev) / page_count;
1009 rs->compress_thread_busy_prev = compression_counters.busy;
1010
1011 compressed_size = compression_counters.compressed_size -
1012 rs->compressed_size_prev;
1013 if (compressed_size) {
1014 double uncompressed_size = (compression_counters.pages -
1015 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1016
1017 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1018 compression_counters.compression_rate =
1019 uncompressed_size / compressed_size;
1020
1021 rs->compress_pages_prev = compression_counters.pages;
1022 rs->compressed_size_prev = compression_counters.compressed_size;
1023 }
1024 }
1025 }
1026
1027 static void migration_trigger_throttle(RAMState *rs)
1028 {
1029 MigrationState *s = migrate_get_current();
1030 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1031
1032 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1033 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1034 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1035
1036 /* During block migration the auto-converge logic incorrectly detects
1037 * that ram migration makes no progress. Avoid this by disabling the
1038 * throttling logic during the bulk phase of block migration. */
1039 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1040 /* The following detection logic can be refined later. For now:
1041 Check to see if the ratio between dirtied bytes and the approx.
1042 amount of bytes that just got transferred since the last time
1043 we were in this routine reaches the threshold. If that happens
1044 twice, start or increase throttling. */
1045
1046 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1047 (++rs->dirty_rate_high_cnt >= 2)) {
1048 trace_migration_throttle();
1049 rs->dirty_rate_high_cnt = 0;
1050 mig_throttle_guest_down(bytes_dirty_period,
1051 bytes_dirty_threshold);
1052 }
1053 }
1054 }
1055
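/*
 * Sync the dirty log from the memory core into each RAMBlock's dirty
 * bitmap (under bitmap_mutex), update the period counters, and once
 * per second recompute rates and decide whether to throttle the guest.
 */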
1056 static void migration_bitmap_sync(RAMState *rs)
1057 {
1058 RAMBlock *block;
1059 int64_t end_time;
1060
1061 ram_counters.dirty_sync_count++;
1062
1063 if (!rs->time_last_bitmap_sync) {
1064 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1065 }
1066
1067 trace_migration_bitmap_sync_start();
1068 memory_global_dirty_log_sync();
1069
1070 qemu_mutex_lock(&rs->bitmap_mutex);
1071 WITH_RCU_READ_LOCK_GUARD() {
1072 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1073 ramblock_sync_dirty_bitmap(rs, block);
1074 }
1075 ram_counters.remaining = ram_bytes_remaining();
1076 }
1077 qemu_mutex_unlock(&rs->bitmap_mutex);
1078
1079 memory_global_after_dirty_log_sync();
1080 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1081
1082 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1083
1084 /* more than 1 second = 1000 milliseconds */
1085 if (end_time > rs->time_last_bitmap_sync + 1000) {
1086 migration_trigger_throttle(rs);
1087
1088 migration_update_rates(rs, end_time);
1089
1090 rs->target_page_count_prev = rs->target_page_count;
1091
1092 /* reset period counters */
1093 rs->time_last_bitmap_sync = end_time;
1094 rs->num_dirty_pages_period = 0;
1095 rs->bytes_xfer_prev = ram_counters.transferred;
1096 }
1097 if (migrate_use_events()) {
1098 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1099 }
1100 }
1101
1102 static void migration_bitmap_sync_precopy(RAMState *rs)
1103 {
1104 Error *local_err = NULL;
1105
1106 /*
1107 * The current notifier usage is just an optimization for migration, so we
1108 * don't stop the normal migration process in the error case.
1109 */
1110 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1111 error_report_err(local_err);
1112 local_err = NULL;
1113 }
1114
1115 migration_bitmap_sync(rs);
1116
1117 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1118 error_report_err(local_err);
1119 }
1120 }
1121
1122 /**
1123 * save_zero_page_to_file: send the zero page to the file
1124 *
1125 * Returns the size of data written to the file, 0 means the page is not
1126 * a zero page
1127 *
1128 * @rs: current RAM state
1129 * @file: the file where the data is saved
1130 * @block: block that contains the page we want to send
1131 * @offset: offset inside the block for the page
1132 */
1133 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1134 RAMBlock *block, ram_addr_t offset)
1135 {
1136 uint8_t *p = block->host + offset;
1137 int len = 0;
1138
1139 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1140 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1141 qemu_put_byte(file, 0);
1142 len += 1;
1143 }
1144 return len;
1145 }
1146
1147 /**
1148 * save_zero_page: send the zero page to the stream
1149 *
1150 * Returns the number of pages written.
1151 *
1152 * @rs: current RAM state
1153 * @block: block that contains the page we want to send
1154 * @offset: offset inside the block for the page
1155 */
1156 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1157 {
1158 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1159
1160 if (len) {
1161 ram_counters.duplicate++;
1162 ram_counters.transferred += len;
1163 return 1;
1164 }
1165 return -1;
1166 }
1167
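/*
 * With the release-ram capability enabled during postcopy, discard the
 * source copy of the pages that were just sent, freeing that memory on
 * the source.
 */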
1168 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1169 {
1170 if (!migrate_release_ram() || !migration_in_postcopy()) {
1171 return;
1172 }
1173
1174 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1175 }
1176
1177 /*
1178 * @pages: the number of pages written by the control path,
1179 * < 0 - error
1180 * > 0 - number of pages written
1181 *
1182 * Return true if the page has been saved, otherwise return false.
1183 */
1184 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1185 int *pages)
1186 {
1187 uint64_t bytes_xmit = 0;
1188 int ret;
1189
1190 *pages = -1;
1191 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1192 &bytes_xmit);
1193 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1194 return false;
1195 }
1196
1197 if (bytes_xmit) {
1198 ram_counters.transferred += bytes_xmit;
1199 *pages = 1;
1200 }
1201
1202 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1203 return true;
1204 }
1205
1206 if (bytes_xmit > 0) {
1207 ram_counters.normal++;
1208 } else if (bytes_xmit == 0) {
1209 ram_counters.duplicate++;
1210 }
1211
1212 return true;
1213 }
1214
1215 /*
1216 * directly send the page to the stream
1217 *
1218 * Returns the number of pages written.
1219 *
1220 * @rs: current RAM state
1221 * @block: block that contains the page we want to send
1222 * @offset: offset inside the block for the page
1223 * @buf: the page to be sent
1224 * @async: send the page asynchronously
1225 */
1226 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1227 uint8_t *buf, bool async)
1228 {
1229 ram_counters.transferred += save_page_header(rs, rs->f, block,
1230 offset | RAM_SAVE_FLAG_PAGE);
1231 if (async) {
1232 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1233 migrate_release_ram() &
1234 migration_in_postcopy());
1235 } else {
1236 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1237 }
1238 ram_counters.transferred += TARGET_PAGE_SIZE;
1239 ram_counters.normal++;
1240 return 1;
1241 }
1242
1243 /**
1244 * ram_save_page: send the given page to the stream
1245 *
1246 * Returns the number of pages written.
1247 * < 0 - error
1248 * >=0 - Number of pages written - this might legally be 0
1249 * if xbzrle noticed the page was the same.
1250 *
1251 * @rs: current RAM state
1252 * @block: block that contains the page we want to send
1253 * @offset: offset inside the block for the page
1254 * @last_stage: if we are at the completion stage
1255 */
1256 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1257 {
1258 int pages = -1;
1259 uint8_t *p;
1260 bool send_async = true;
1261 RAMBlock *block = pss->block;
1262 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1263 ram_addr_t current_addr = block->offset + offset;
1264
1265 p = block->host + offset;
1266 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1267
1268 XBZRLE_cache_lock();
1269 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1270 pages = save_xbzrle_page(rs, &p, current_addr, block,
1271 offset, last_stage);
1272 if (!last_stage) {
1273 /* Can't send this cached data async, since the cache page
1274 * might get updated before it gets to the wire
1275 */
1276 send_async = false;
1277 }
1278 }
1279
1280 /* XBZRLE overflow or normal page */
1281 if (pages == -1) {
1282 pages = save_normal_page(rs, block, offset, p, send_async);
1283 }
1284
1285 XBZRLE_cache_unlock();
1286
1287 return pages;
1288 }
1289
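/*
 * Queue the page at @offset in @block onto the multifd channels.
 * Returns 1 on success or -1 if queueing failed.
 */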
1290 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1291 ram_addr_t offset)
1292 {
1293 if (multifd_queue_page(rs->f, block, offset) < 0) {
1294 return -1;
1295 }
1296 ram_counters.normal++;
1297
1298 return 1;
1299 }
1300
1301 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1302 ram_addr_t offset, uint8_t *source_buf)
1303 {
1304 RAMState *rs = ram_state;
1305 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1306 bool zero_page = false;
1307 int ret;
1308
1309 if (save_zero_page_to_file(rs, f, block, offset)) {
1310 zero_page = true;
1311 goto exit;
1312 }
1313
1314 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1315
1316 /*
1317 * copy it to an internal buffer to avoid it being modified by the VM,
1318 * so that we can catch any error during compression and
1319 * decompression
1320 */
1321 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1322 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1323 if (ret < 0) {
1324 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1325 error_report("compressed data failed!");
1326 return false;
1327 }
1328
1329 exit:
1330 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1331 return zero_page;
1332 }
1333
1334 static void
1335 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1336 {
1337 ram_counters.transferred += bytes_xmit;
1338
1339 if (param->zero_page) {
1340 ram_counters.duplicate++;
1341 return;
1342 }
1343
1344 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1345 compression_counters.compressed_size += bytes_xmit - 8;
1346 compression_counters.pages++;
1347 }
1348
1349 static bool save_page_use_compression(RAMState *rs);
1350
1351 static void flush_compressed_data(RAMState *rs)
1352 {
1353 int idx, len, thread_count;
1354
1355 if (!save_page_use_compression(rs)) {
1356 return;
1357 }
1358 thread_count = migrate_compress_threads();
1359
1360 qemu_mutex_lock(&comp_done_lock);
1361 for (idx = 0; idx < thread_count; idx++) {
1362 while (!comp_param[idx].done) {
1363 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1364 }
1365 }
1366 qemu_mutex_unlock(&comp_done_lock);
1367
1368 for (idx = 0; idx < thread_count; idx++) {
1369 qemu_mutex_lock(&comp_param[idx].mutex);
1370 if (!comp_param[idx].quit) {
1371 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1372 /*
1373 * it's safe to fetch zero_page without holding comp_done_lock
1374 * as there is no further request submitted to the thread,
1375 * i.e., the thread should be waiting for a request at this point.
1376 */
1377 update_compress_thread_counts(&comp_param[idx], len);
1378 }
1379 qemu_mutex_unlock(&comp_param[idx].mutex);
1380 }
1381 }
1382
1383 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1384 ram_addr_t offset)
1385 {
1386 param->block = block;
1387 param->offset = offset;
1388 }
1389
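/*
 * Hand the page to an idle compression thread: flush that thread's
 * previous output into the migration stream, post the new block/offset
 * and wake it up. If no thread is idle, either wait for one (when
 * compress-wait-thread is set) or return -1 so the caller sends the
 * page as a normal page.
 */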
1390 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1391 ram_addr_t offset)
1392 {
1393 int idx, thread_count, bytes_xmit = -1, pages = -1;
1394 bool wait = migrate_compress_wait_thread();
1395
1396 thread_count = migrate_compress_threads();
1397 qemu_mutex_lock(&comp_done_lock);
1398 retry:
1399 for (idx = 0; idx < thread_count; idx++) {
1400 if (comp_param[idx].done) {
1401 comp_param[idx].done = false;
1402 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1403 qemu_mutex_lock(&comp_param[idx].mutex);
1404 set_compress_params(&comp_param[idx], block, offset);
1405 qemu_cond_signal(&comp_param[idx].cond);
1406 qemu_mutex_unlock(&comp_param[idx].mutex);
1407 pages = 1;
1408 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1409 break;
1410 }
1411 }
1412
1413 /*
1414 * wait for a free thread if the user specifies 'compress-wait-thread',
1415 * otherwise we will post the page out in the main thread as a normal page.
1416 */
1417 if (pages < 0 && wait) {
1418 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1419 goto retry;
1420 }
1421 qemu_mutex_unlock(&comp_done_lock);
1422
1423 return pages;
1424 }
1425
1426 /**
1427 * find_dirty_block: find the next dirty page and update any state
1428 * associated with the search process.
1429 *
1430 * Returns true if a page is found
1431 *
1432 * @rs: current RAM state
1433 * @pss: data about the state of the current dirty page scan
1434 * @again: set to false if the search has scanned the whole of RAM
1435 */
1436 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1437 {
1438 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1439 if (pss->complete_round && pss->block == rs->last_seen_block &&
1440 pss->page >= rs->last_page) {
1441 /*
1442 * We've been once around the RAM and haven't found anything.
1443 * Give up.
1444 */
1445 *again = false;
1446 return false;
1447 }
1448 if (!offset_in_ramblock(pss->block,
1449 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1450 /* Didn't find anything in this RAM Block */
1451 pss->page = 0;
1452 pss->block = QLIST_NEXT_RCU(pss->block, next);
1453 if (!pss->block) {
1454 /*
1455 * If memory migration starts over, we will meet a dirtied page
1456 * which may still exist in the compression threads' ring, so we
1457 * should flush the compressed data to make sure the new page
1458 * is not overwritten by the old one in the destination.
1459 *
1460 * Also, if xbzrle is on, stop using the data compression at this
1461 * point. In theory, xbzrle can do better than compression.
1462 */
1463 flush_compressed_data(rs);
1464
1465 /* Hit the end of the list */
1466 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1467 /* Flag that we've looped */
1468 pss->complete_round = true;
1469 /* After the first round, enable XBZRLE. */
1470 if (migrate_use_xbzrle()) {
1471 rs->xbzrle_enabled = true;
1472 }
1473 }
1474 /* Didn't find anything this time, but try again on the new block */
1475 *again = true;
1476 return false;
1477 } else {
1478 /* Can go around again, but... */
1479 *again = true;
1480 /* We've found something so probably don't need to */
1481 return true;
1482 }
1483 }
1484
1485 /**
1486 * unqueue_page: gets a page off the queue
1487 *
1488 * Helper for 'get_queued_page' - gets a page off the queue
1489 *
1490 * Returns the block of the page (or NULL if none available)
1491 *
1492 * @rs: current RAM state
1493 * @offset: used to return the offset within the RAMBlock
1494 */
1495 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1496 {
1497 RAMBlock *block = NULL;
1498
1499 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1500 return NULL;
1501 }
1502
1503 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1504 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1505 struct RAMSrcPageRequest *entry =
1506 QSIMPLEQ_FIRST(&rs->src_page_requests);
1507 block = entry->rb;
1508 *offset = entry->offset;
1509
1510 if (entry->len > TARGET_PAGE_SIZE) {
1511 entry->len -= TARGET_PAGE_SIZE;
1512 entry->offset += TARGET_PAGE_SIZE;
1513 } else {
1514 memory_region_unref(block->mr);
1515 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1516 g_free(entry);
1517 migration_consume_urgent_request();
1518 }
1519 }
1520
1521 return block;
1522 }
1523
1524 #if defined(__linux__)
1525 /**
1526 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1527 * is found, return RAM block pointer and page offset
1528 *
1529 * Returns pointer to the RAMBlock containing faulting page,
1530 * NULL if no write faults are pending
1531 *
1532 * @rs: current RAM state
1533 * @offset: page offset from the beginning of the block
1534 */
1535 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1536 {
1537 struct uffd_msg uffd_msg;
1538 void *page_address;
1539 RAMBlock *block;
1540 int res;
1541
1542 if (!migrate_background_snapshot()) {
1543 return NULL;
1544 }
1545
1546 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1547 if (res <= 0) {
1548 return NULL;
1549 }
1550
1551 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1552 block = qemu_ram_block_from_host(page_address, false, offset);
1553 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1554 return block;
1555 }
1556
1557 /**
1558 * ram_save_release_protection: release UFFD write protection after
1559 * a range of pages has been saved
1560 *
1561 * @rs: current RAM state
1562 * @pss: page-search-status structure
1563 * @start_page: index of the first page in the range relative to pss->block
1564 *
1565 * Returns 0 on success, negative value in case of an error
1566 */
1567 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1568 unsigned long start_page)
1569 {
1570 int res = 0;
1571
1572 /* Check if page is from UFFD-managed region. */
1573 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1574 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1575 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1576
1577 /* Flush async buffers before un-protect. */
1578 qemu_fflush(rs->f);
1579 /* Un-protect memory range. */
1580 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1581 false, false);
1582 }
1583
1584 return res;
1585 }
1586
1587 /* ram_write_tracking_available: check if kernel supports required UFFD features
1588 *
1589 * Returns true if supported, false otherwise
1590 */
1591 bool ram_write_tracking_available(void)
1592 {
1593 uint64_t uffd_features;
1594 int res;
1595
1596 res = uffd_query_features(&uffd_features);
1597 return (res == 0 &&
1598 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1599 }
1600
1601 /* ram_write_tracking_compatible: check if guest configuration is
1602 * compatible with 'write-tracking'
1603 *
1604 * Returns true if compatible, false otherwise
1605 */
1606 bool ram_write_tracking_compatible(void)
1607 {
1608 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1609 int uffd_fd;
1610 RAMBlock *block;
1611 bool ret = false;
1612
1613 /* Open UFFD file descriptor */
1614 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1615 if (uffd_fd < 0) {
1616 return false;
1617 }
1618
1619 RCU_READ_LOCK_GUARD();
1620
1621 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1622 uint64_t uffd_ioctls;
1623
1624 /* Nothing to do with read-only and MMIO-writable regions */
1625 if (block->mr->readonly || block->mr->rom_device) {
1626 continue;
1627 }
1628 /* Try to register block memory via UFFD-IO to track writes */
1629 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1630 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1631 goto out;
1632 }
1633 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1634 goto out;
1635 }
1636 }
1637 ret = true;
1638
1639 out:
1640 uffd_close_fd(uffd_fd);
1641 return ret;
1642 }
1643
1644 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1645 ram_addr_t size)
1646 {
1647 /*
1648 * We read one byte of each page; this will preallocate page tables if
1649 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1650 * where no page was populated yet. This might require adaptation when
1651 * supporting other mappings, like shmem.
1652 */
1653 for (; offset < size; offset += block->page_size) {
1654 char tmp = *((char *)block->host + offset);
1655
1656 /* Don't optimize the read out */
1657 asm volatile("" : "+r" (tmp));
1658 }
1659 }
1660
1661 static inline int populate_read_section(MemoryRegionSection *section,
1662 void *opaque)
1663 {
1664 const hwaddr size = int128_get64(section->size);
1665 hwaddr offset = section->offset_within_region;
1666 RAMBlock *block = section->mr->ram_block;
1667
1668 populate_read_range(block, offset, size);
1669 return 0;
1670 }
1671
1672 /*
1673 * ram_block_populate_read: preallocate page tables and populate pages in the
1674 * RAM block by reading a byte of each page.
1675 *
1676 * Since it's solely used for userfault_fd WP feature, here we just
1677 * hardcode page size to qemu_real_host_page_size.
1678 *
1679 * @rb: RAM block to populate
1680 */
1681 static void ram_block_populate_read(RAMBlock *rb)
1682 {
1683 /*
1684 * Skip populating all pages that fall into a discarded range as managed by
1685 * a RamDiscardManager responsible for the mapped memory region of the
1686 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1687 * must not get populated automatically. We don't have to track
1688 * modifications via userfaultfd WP reliably, because these pages will
1689 * not be part of the migration stream either way -- see
1690 * ramblock_dirty_bitmap_clear_discarded_pages().
1691 *
1692 * Note: The result is only stable while migrating (precopy/postcopy).
1693 */
1694 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1695 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1696 MemoryRegionSection section = {
1697 .mr = rb->mr,
1698 .offset_within_region = 0,
1699 .size = rb->mr->size,
1700 };
1701
1702 ram_discard_manager_replay_populated(rdm, &section,
1703 populate_read_section, NULL);
1704 } else {
1705 populate_read_range(rb, 0, rb->used_length);
1706 }
1707 }
1708
1709 /*
1710 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1711 */
1712 void ram_write_tracking_prepare(void)
1713 {
1714 RAMBlock *block;
1715
1716 RCU_READ_LOCK_GUARD();
1717
1718 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1719 /* Nothing to do with read-only and MMIO-writable regions */
1720 if (block->mr->readonly || block->mr->rom_device) {
1721 continue;
1722 }
1723
1724 /*
1725 * Populate pages of the RAM block before enabling userfault_fd
1726 * write protection.
1727 *
1728 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1729 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1730 * pages with pte_none() entries in page table.
1731 */
1732 ram_block_populate_read(block);
1733 }
1734 }
1735
1736 /*
1737 * ram_write_tracking_start: start UFFD-WP memory tracking
1738 *
1739 * Returns 0 for success or negative value in case of error
1740 */
1741 int ram_write_tracking_start(void)
1742 {
1743 int uffd_fd;
1744 RAMState *rs = ram_state;
1745 RAMBlock *block;
1746
1747 /* Open UFFD file descriptor */
1748 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1749 if (uffd_fd < 0) {
1750 return uffd_fd;
1751 }
1752 rs->uffdio_fd = uffd_fd;
1753
1754 RCU_READ_LOCK_GUARD();
1755
1756 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1757 /* Nothing to do with read-only and MMIO-writable regions */
1758 if (block->mr->readonly || block->mr->rom_device) {
1759 continue;
1760 }
1761
1762 /* Register block memory with UFFD to track writes */
1763 if (uffd_register_memory(rs->uffdio_fd, block->host,
1764 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1765 goto fail;
1766 }
1767 /* Apply UFFD write protection to the block memory range */
1768 if (uffd_change_protection(rs->uffdio_fd, block->host,
1769 block->max_length, true, false)) {
1770 goto fail;
1771 }
1772 block->flags |= RAM_UF_WRITEPROTECT;
1773 memory_region_ref(block->mr);
1774
1775 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1776 block->host, block->max_length);
1777 }
1778
1779 return 0;
1780
1781 fail:
1782 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1783
1784 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1786 continue;
1787 }
1788 /*
1789 * In case some memory block failed to be write-protected
1790 * remove protection and unregister all succeeded RAM blocks
1791 */
1792 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1793 false, false);
1794 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1795 /* Cleanup flags and remove reference */
1796 block->flags &= ~RAM_UF_WRITEPROTECT;
1797 memory_region_unref(block->mr);
1798 }
1799
1800 uffd_close_fd(uffd_fd);
1801 rs->uffdio_fd = -1;
1802 return -1;
1803 }
1804
1805 /**
1806 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1807 */
1808 void ram_write_tracking_stop(void)
1809 {
1810 RAMState *rs = ram_state;
1811 RAMBlock *block;
1812
1813 RCU_READ_LOCK_GUARD();
1814
1815 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1816 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1817 continue;
1818 }
1819 /* Remove protection and unregister all affected RAM blocks */
1820 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1821 false, false);
1822 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1823
1824 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1825 block->host, block->max_length);
1826
1827 /* Cleanup flags and remove reference */
1828 block->flags &= ~RAM_UF_WRITEPROTECT;
1829 memory_region_unref(block->mr);
1830 }
1831
1832 /* Finally close UFFD file descriptor */
1833 uffd_close_fd(rs->uffdio_fd);
1834 rs->uffdio_fd = -1;
1835 }
1836
1837 #else
1838 /* No target OS support, stubs just fail or ignore */
1839
1840 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1841 {
1842 (void) rs;
1843 (void) offset;
1844
1845 return NULL;
1846 }
1847
1848 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1849 unsigned long start_page)
1850 {
1851 (void) rs;
1852 (void) pss;
1853 (void) start_page;
1854
1855 return 0;
1856 }
1857
1858 bool ram_write_tracking_available(void)
1859 {
1860 return false;
1861 }
1862
1863 bool ram_write_tracking_compatible(void)
1864 {
1865 assert(0);
1866 return false;
1867 }
1868
1869 int ram_write_tracking_start(void)
1870 {
1871 assert(0);
1872 return -1;
1873 }
1874
1875 void ram_write_tracking_stop(void)
1876 {
1877 assert(0);
1878 }
1879 #endif /* defined(__linux__) */
1880
1881 /**
1882 * get_queued_page: unqueue a page from the postcopy requests
1883 *
1884 * Skips pages that are already sent (!dirty)
1885 *
1886 * Returns true if a queued page is found
1887 *
1888 * @rs: current RAM state
1889 * @pss: data about the state of the current dirty page scan
1890 */
1891 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1892 {
1893 RAMBlock *block;
1894 ram_addr_t offset;
1895 bool dirty;
1896
1897 do {
1898 block = unqueue_page(rs, &offset);
1899 /*
1900 * We're sending this page, and since it's postcopy nothing else
1901 * will dirty it, and we must make sure it doesn't get sent again
1902 * even if this queue request was received after the background
1903 * search already sent it.
1904 */
1905 if (block) {
1906 unsigned long page;
1907
1908 page = offset >> TARGET_PAGE_BITS;
1909 dirty = test_bit(page, block->bmap);
1910 if (!dirty) {
1911 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1912 page);
1913 } else {
1914 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1915 }
1916 }
1917
1918 } while (block && !dirty);
1919
1920 if (!block) {
1921 /*
1922 * Poll write faults too if background snapshot is enabled; that's
1923 * when vCPUs can get blocked by write-protected pages.
1924 */
1925 block = poll_fault_page(rs, &offset);
1926 }
1927
1928 if (block) {
1929 /*
1930 * We want the background search to continue from the queued page
1931 * since the guest is likely to want other pages near to the page
1932 * it just requested.
1933 */
1934 pss->block = block;
1935 pss->page = offset >> TARGET_PAGE_BITS;
1936
1937 /*
1938 * This unqueued page would break the "one round" check, even if
1939 * it is really rare.
1940 */
1941 pss->complete_round = false;
1942 }
1943
1944 return !!block;
1945 }
1946
1947 /**
1948 * migration_page_queue_free: drop any remaining pages in the ram
1949 * request queue
1950 *
1951 * It should be empty at the end anyway, but in error cases there may
1952 * be some left; if any pages remain, we drop them.
1953 *
1954 */
1955 static void migration_page_queue_free(RAMState *rs)
1956 {
1957 struct RAMSrcPageRequest *mspr, *next_mspr;
1958 /* This queue generally should be empty - but in the case of a failed
1959 * migration it might contain some leftover requests.
1960 */
1961 RCU_READ_LOCK_GUARD();
1962 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1963 memory_region_unref(mspr->rb->mr);
1964 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1965 g_free(mspr);
1966 }
1967 }
1968
1969 /**
1970 * ram_save_queue_pages: queue the page for transmission
1971 *
1972 * A request from postcopy destination for example.
1973 *
1974 * Returns zero on success or negative on error
1975 *
1976 * @rbname: Name of the RAMBlock of the request. NULL means the
1977 * same as the last one.
1978 * @start: starting address from the start of the RAMBlock
1979 * @len: length (in bytes) to send
1980 */
1981 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1982 {
1983 RAMBlock *ramblock;
1984 RAMState *rs = ram_state;
1985
1986 ram_counters.postcopy_requests++;
1987 RCU_READ_LOCK_GUARD();
1988
1989 if (!rbname) {
1990 /* Reuse last RAMBlock */
1991 ramblock = rs->last_req_rb;
1992
1993 if (!ramblock) {
1994 /*
1995 * Shouldn't happen, we can't reuse the last RAMBlock if
1996 * it's the 1st request.
1997 */
1998 error_report("ram_save_queue_pages no previous block");
1999 return -1;
2000 }
2001 } else {
2002 ramblock = qemu_ram_block_by_name(rbname);
2003
2004 if (!ramblock) {
2005 /* We shouldn't be asked for a non-existent RAMBlock */
2006 error_report("ram_save_queue_pages no block '%s'", rbname);
2007 return -1;
2008 }
2009 rs->last_req_rb = ramblock;
2010 }
2011 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2012 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2013 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2014 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2015 __func__, start, len, ramblock->used_length);
2016 return -1;
2017 }
2018
2019 struct RAMSrcPageRequest *new_entry =
2020 g_malloc0(sizeof(struct RAMSrcPageRequest));
2021 new_entry->rb = ramblock;
2022 new_entry->offset = start;
2023 new_entry->len = len;
2024
2025 memory_region_ref(ramblock->mr);
2026 qemu_mutex_lock(&rs->src_page_req_mutex);
2027 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2028 migration_make_urgent_request();
2029 qemu_mutex_unlock(&rs->src_page_req_mutex);
2030
2031 return 0;
2032 }
2033
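/*
 * Editor's sketch (not from the original file): the queuing pattern used by
 * ram_save_queue_pages() reduced to plain pthreads -- allocate a request
 * node and append it to a singly linked list under a mutex so that the
 * sending thread can pop it later.  All names are invented for the example;
 * the reference counting on the memory region is left out.
 */
#include <pthread.h>
#include <stdlib.h>

struct example_req {
    unsigned long start, len;
    struct example_req *next;
};

static struct example_req *req_head, *req_tail;
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;

static int example_queue_request(unsigned long start, unsigned long len)
{
    struct example_req *r = calloc(1, sizeof(*r));

    if (!r) {
        return -1;
    }
    r->start = start;
    r->len = len;

    pthread_mutex_lock(&req_lock);
    if (req_tail) {
        req_tail->next = r;
    } else {
        req_head = r;
    }
    req_tail = r;
    pthread_mutex_unlock(&req_lock);
    return 0;
}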
2034 static bool save_page_use_compression(RAMState *rs)
2035 {
2036 if (!migrate_use_compression()) {
2037 return false;
2038 }
2039
2040 /*
2041 * If xbzrle is enabled (e.g., after the first round of migration), stop
2042 * using the data compression. In theory, xbzrle can do better than
2043 * compression.
2044 */
2045 if (rs->xbzrle_enabled) {
2046 return false;
2047 }
2048
2049 return true;
2050 }
2051
2052 /*
2053 * Try to compress the page before posting it out; return true if the page
2054 * has been properly handled by compression, otherwise it needs other
2055 * paths to handle it.
2056 */
2057 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2058 {
2059 if (!save_page_use_compression(rs)) {
2060 return false;
2061 }
2062
2063 /*
2064 * When starting the process of a new block, the first page of
2065 * the block should be sent out before other pages in the same
2066 * block, and all the pages in the last block should have been sent
2067 * out. Keeping this order is important, because the 'cont' flag
2068 * is used to avoid resending the block name.
2069 *
2070 * We post the first page as a normal page as compression will take
2071 * much CPU resource.
2072 */
2073 if (block != rs->last_sent_block) {
2074 flush_compressed_data(rs);
2075 return false;
2076 }
2077
2078 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2079 return true;
2080 }
2081
2082 compression_counters.busy++;
2083 return false;
2084 }
2085
2086 /**
2087 * ram_save_target_page: save one target page
2088 *
2089 * Returns the number of pages written
2090 *
2091 * @rs: current RAM state
2092 * @pss: data about the page we want to send
2093 * @last_stage: if we are at the completion stage
2094 */
2095 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2096 bool last_stage)
2097 {
2098 RAMBlock *block = pss->block;
2099 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2100 int res;
2101
2102 if (control_save_page(rs, block, offset, &res)) {
2103 return res;
2104 }
2105
2106 if (save_compress_page(rs, block, offset)) {
2107 return 1;
2108 }
2109
2110 res = save_zero_page(rs, block, offset);
2111 if (res > 0) {
2112 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2113 * page would be stale
2114 */
2115 if (!save_page_use_compression(rs)) {
2116 XBZRLE_cache_lock();
2117 xbzrle_cache_zero_page(rs, block->offset + offset);
2118 XBZRLE_cache_unlock();
2119 }
2120 ram_release_pages(block->idstr, offset, res);
2121 return res;
2122 }
2123
2124 /*
2125 * Do not use multifd for:
2126 * 1. Compression, as the first page in the new block should be posted out
2127 * before sending the compressed page
2128 * 2. Postcopy, as one whole host page should be placed
2129 */
2130 if (!save_page_use_compression(rs) && migrate_use_multifd()
2131 && !migration_in_postcopy()) {
2132 return ram_save_multifd_page(rs, block, offset);
2133 }
2134
2135 return ram_save_page(rs, pss, last_stage);
2136 }
2137
2138 /**
2139 * ram_save_host_page: save a whole host page
2140 *
2141 * Starting at *offset send pages up to the end of the current host
2142 * page. It's valid for the initial offset to point into the middle of
2143 * a host page in which case the remainder of the hostpage is sent.
2144 * Only dirty target pages are sent. Note that the host page size may
2145 * be a huge page for this block.
2146 * The saving stops at the boundary of the used_length of the block
2147 * if the RAMBlock isn't a multiple of the host page size.
2148 *
2149 * Returns the number of pages written or negative on error
2150 *
2151 * @rs: current RAM state
2152 * @ms: current migration state
2153 * @pss: data about the page we want to send
2154 * @last_stage: if we are at the completion stage
2155 */
2156 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2157 bool last_stage)
2158 {
2159 int tmppages, pages = 0;
2160 size_t pagesize_bits =
2161 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2162 unsigned long hostpage_boundary =
2163 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2164 unsigned long start_page = pss->page;
2165 int res;
2166
2167 if (ramblock_is_ignored(pss->block)) {
2168 error_report("block %s should not be migrated !", pss->block->idstr);
2169 return 0;
2170 }
2171
2172 do {
2173 /* Check whether the page is dirty and if so, send it */
2174 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2175 tmppages = ram_save_target_page(rs, pss, last_stage);
2176 if (tmppages < 0) {
2177 return tmppages;
2178 }
2179
2180 pages += tmppages;
2181 /*
2182 * Allow rate limiting to happen in the middle of huge pages if
2183 * something is sent in the current iteration.
2184 */
2185 if (pagesize_bits > 1 && tmppages > 0) {
2186 migration_rate_limit();
2187 }
2188 }
2189 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2190 } while ((pss->page < hostpage_boundary) &&
2191 offset_in_ramblock(pss->block,
2192 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2193 /* The offset we leave with is the min boundary of host page and block */
2194 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2195
2196 res = ram_save_release_protection(rs, pss, start_page);
2197 return (res < 0 ? res : pages);
2198 }
2199
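/*
 * Editor's sketch (not from the original file): the boundary arithmetic
 * used by ram_save_host_page().  With 4 KiB target pages and a 2 MiB
 * hugepage RAMBlock, pagesize_bits is 512, so the loop keeps sending
 * target pages until the page index reaches the next multiple of 512.
 * The macro mirrors QEMU_ALIGN_UP only for this illustration.
 */
#define EXAMPLE_ALIGN_UP(n, m)  (((n) + (m) - 1) / (m) * (m))

static unsigned long example_hostpage_boundary(unsigned long target_page,
                                               unsigned long host_pagesize,
                                               unsigned long target_pagesize)
{
    unsigned long pagesize_bits = host_pagesize / target_pagesize;

    /* e.g. target_page = 1000, pagesize_bits = 512 -> boundary = 1024 */
    return EXAMPLE_ALIGN_UP(target_page + 1, pagesize_bits);
}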
2200 /**
2201 * ram_find_and_save_block: finds a dirty page and sends it to f
2202 *
2203 * Called within an RCU critical section.
2204 *
2205 * Returns the number of pages written where zero means no dirty pages,
2206 * or negative on error
2207 *
2208 * @rs: current RAM state
2209 * @last_stage: if we are at the completion stage
2210 *
2211 * On systems where host-page-size > target-page-size it will send all the
2212 * pages in a host page that are dirty.
2213 */
2214
2215 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2216 {
2217 PageSearchStatus pss;
2218 int pages = 0;
2219 bool again, found;
2220
2221 /* No dirty page as there is zero RAM */
2222 if (!ram_bytes_total()) {
2223 return pages;
2224 }
2225
2226 pss.block = rs->last_seen_block;
2227 pss.page = rs->last_page;
2228 pss.complete_round = false;
2229
2230 if (!pss.block) {
2231 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2232 }
2233
2234 do {
2235 again = true;
2236 found = get_queued_page(rs, &pss);
2237
2238 if (!found) {
2239 /* priority queue empty, so just search for something dirty */
2240 found = find_dirty_block(rs, &pss, &again);
2241 }
2242
2243 if (found) {
2244 pages = ram_save_host_page(rs, &pss, last_stage);
2245 }
2246 } while (!pages && again);
2247
2248 rs->last_seen_block = pss.block;
2249 rs->last_page = pss.page;
2250
2251 return pages;
2252 }
2253
2254 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2255 {
2256 uint64_t pages = size / TARGET_PAGE_SIZE;
2257
2258 if (zero) {
2259 ram_counters.duplicate += pages;
2260 } else {
2261 ram_counters.normal += pages;
2262 ram_counters.transferred += size;
2263 qemu_update_position(f, size);
2264 }
2265 }
2266
2267 static uint64_t ram_bytes_total_common(bool count_ignored)
2268 {
2269 RAMBlock *block;
2270 uint64_t total = 0;
2271
2272 RCU_READ_LOCK_GUARD();
2273
2274 if (count_ignored) {
2275 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2276 total += block->used_length;
2277 }
2278 } else {
2279 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2280 total += block->used_length;
2281 }
2282 }
2283 return total;
2284 }
2285
2286 uint64_t ram_bytes_total(void)
2287 {
2288 return ram_bytes_total_common(false);
2289 }
2290
2291 static void xbzrle_load_setup(void)
2292 {
2293 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2294 }
2295
2296 static void xbzrle_load_cleanup(void)
2297 {
2298 g_free(XBZRLE.decoded_buf);
2299 XBZRLE.decoded_buf = NULL;
2300 }
2301
2302 static void ram_state_cleanup(RAMState **rsp)
2303 {
2304 if (*rsp) {
2305 migration_page_queue_free(*rsp);
2306 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2307 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2308 g_free(*rsp);
2309 *rsp = NULL;
2310 }
2311 }
2312
2313 static void xbzrle_cleanup(void)
2314 {
2315 XBZRLE_cache_lock();
2316 if (XBZRLE.cache) {
2317 cache_fini(XBZRLE.cache);
2318 g_free(XBZRLE.encoded_buf);
2319 g_free(XBZRLE.current_buf);
2320 g_free(XBZRLE.zero_target_page);
2321 XBZRLE.cache = NULL;
2322 XBZRLE.encoded_buf = NULL;
2323 XBZRLE.current_buf = NULL;
2324 XBZRLE.zero_target_page = NULL;
2325 }
2326 XBZRLE_cache_unlock();
2327 }
2328
2329 static void ram_save_cleanup(void *opaque)
2330 {
2331 RAMState **rsp = opaque;
2332 RAMBlock *block;
2333
2334 /* We don't use dirty log with background snapshots */
2335 if (!migrate_background_snapshot()) {
2336 /* The caller holds the iothread lock or is in a BH, so there is
2337 * no write race against the migration bitmap
2338 */
2339 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2340 /*
2341 * do not stop the dirty log without having started it, since
2342 * memory_global_dirty_log_stop will assert that
2343 * memory_global_dirty_log_start/stop are used in pairs
2344 */
2345 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2346 }
2347 }
2348
2349 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2350 g_free(block->clear_bmap);
2351 block->clear_bmap = NULL;
2352 g_free(block->bmap);
2353 block->bmap = NULL;
2354 }
2355
2356 xbzrle_cleanup();
2357 compress_threads_save_cleanup();
2358 ram_state_cleanup(rsp);
2359 }
2360
2361 static void ram_state_reset(RAMState *rs)
2362 {
2363 rs->last_seen_block = NULL;
2364 rs->last_sent_block = NULL;
2365 rs->last_page = 0;
2366 rs->last_version = ram_list.version;
2367 rs->xbzrle_enabled = false;
2368 }
2369
2370 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2371
2372 /*
2373 * 'expected' is the value you expect the bitmap mostly to be full
2374 * of; it won't bother printing lines that are all this value.
2375 * If 'todump' is null the migration bitmap is dumped.
2376 */
2377 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2378 unsigned long pages)
2379 {
2380 int64_t cur;
2381 int64_t linelen = 128;
2382 char linebuf[129];
2383
2384 for (cur = 0; cur < pages; cur += linelen) {
2385 int64_t curb;
2386 bool found = false;
2387 /*
2388 * Last line; catch the case where the line length
2389 * is longer than remaining ram
2390 */
2391 if (cur + linelen > pages) {
2392 linelen = pages - cur;
2393 }
2394 for (curb = 0; curb < linelen; curb++) {
2395 bool thisbit = test_bit(cur + curb, todump);
2396 linebuf[curb] = thisbit ? '1' : '.';
2397 found = found || (thisbit != expected);
2398 }
2399 if (found) {
2400 linebuf[curb] = '\0';
2401 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2402 }
2403 }
2404 }
2405
2406 /* **** functions for postcopy ***** */
2407
2408 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 {
2410 struct RAMBlock *block;
2411
2412 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2413 unsigned long *bitmap = block->bmap;
2414 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2415 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416
2417 while (run_start < range) {
2418 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2419 ram_discard_range(block->idstr,
2420 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2421 ((ram_addr_t)(run_end - run_start))
2422 << TARGET_PAGE_BITS);
2423 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2424 }
2425 }
2426 }
2427
2428 /**
2429 * postcopy_send_discard_bm_ram: discard a RAMBlock
2430 *
2431 * Returns zero on success
2432 *
2433 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2434 *
2435 * @ms: current migration state
2436 * @block: RAMBlock to discard
2437 */
2438 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2439 {
2440 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2441 unsigned long current;
2442 unsigned long *bitmap = block->bmap;
2443
2444 for (current = 0; current < end; ) {
2445 unsigned long one = find_next_bit(bitmap, end, current);
2446 unsigned long zero, discard_length;
2447
2448 if (one >= end) {
2449 break;
2450 }
2451
2452 zero = find_next_zero_bit(bitmap, end, one + 1);
2453
2454 if (zero >= end) {
2455 discard_length = end - one;
2456 } else {
2457 discard_length = zero - one;
2458 }
2459 postcopy_discard_send_range(ms, one, discard_length);
2460 current = one + discard_length;
2461 }
2462
2463 return 0;
2464 }
2465
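/*
 * Editor's sketch (not from the original file): the run-length walk that
 * postcopy_send_discard_bm_ram() performs, written against a plain byte
 * array instead of QEMU's bitmap helpers.  Each (start, length) pair of
 * consecutive dirty pages would become one discard message on the wire.
 */
#include <stdbool.h>
#include <stdio.h>

static void example_send_dirty_runs(const bool *dirty, unsigned long npages)
{
    unsigned long cur = 0;

    while (cur < npages) {
        unsigned long one, zero;

        for (one = cur; one < npages && !dirty[one]; one++) {
            /* skip clean pages */
        }
        if (one >= npages) {
            break;
        }
        for (zero = one + 1; zero < npages && dirty[zero]; zero++) {
            /* extend the dirty run */
        }
        printf("discard run: start=%lu len=%lu\n", one, zero - one);
        cur = zero;
    }
}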
2466 /**
2467 * postcopy_each_ram_send_discard: discard all RAMBlocks
2468 *
2469 * Returns 0 for success or negative for error
2470 *
2471 * Utility for the outgoing postcopy code.
2472 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2473 * passing it bitmap indexes and name.
2474 * (qemu_ram_foreach_block ends up passing unscaled lengths
2475 * which would mean postcopy code would have to deal with target page)
2476 *
2477 * @ms: current migration state
2478 */
2479 static int postcopy_each_ram_send_discard(MigrationState *ms)
2480 {
2481 struct RAMBlock *block;
2482 int ret;
2483
2484 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2485 postcopy_discard_send_init(ms, block->idstr);
2486
2487 /*
2488 * Postcopy sends chunks of bitmap over the wire, but it
2489 * just needs indexes at this point, avoids it having
2490 * target page specific code.
2491 */
2492 ret = postcopy_send_discard_bm_ram(ms, block);
2493 postcopy_discard_send_finish(ms);
2494 if (ret) {
2495 return ret;
2496 }
2497 }
2498
2499 return 0;
2500 }
2501
2502 /**
2503 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2504 *
2505 * Helper for postcopy_chunk_hostpages; it's called twice to
2506 * canonicalize the two bitmaps, that are similar, but one is
2507 * inverted.
2508 *
2509 * Postcopy requires that all target pages in a hostpage are dirty or
2510 * clean, not a mix. This function canonicalizes the bitmaps.
2511 *
2512 * @ms: current migration state
2513 * @block: block that contains the page we want to canonicalize
2514 */
2515 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2516 {
2517 RAMState *rs = ram_state;
2518 unsigned long *bitmap = block->bmap;
2519 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2520 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2521 unsigned long run_start;
2522
2523 if (block->page_size == TARGET_PAGE_SIZE) {
2524 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2525 return;
2526 }
2527
2528 /* Find a dirty page */
2529 run_start = find_next_bit(bitmap, pages, 0);
2530
2531 while (run_start < pages) {
2532
2533 /*
2534 * If the start of this run of pages is in the middle of a host
2535 * page, then we need to fixup this host page.
2536 */
2537 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2538 /* Find the end of this run */
2539 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2540 /*
2541 * If the end isn't at the start of a host page, then the
2542 * run doesn't finish at the end of a host page
2543 * and we need to discard.
2544 */
2545 }
2546
2547 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2548 unsigned long page;
2549 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2550 host_ratio);
2551 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2552
2553 /* Clean up the bitmap */
2554 for (page = fixup_start_addr;
2555 page < fixup_start_addr + host_ratio; page++) {
2556 /*
2557 * Remark them as dirty, updating the count for any pages
2558 * that weren't previously dirty.
2559 */
2560 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2561 }
2562 }
2563
2564 /* Find the next dirty page for the next iteration */
2565 run_start = find_next_bit(bitmap, pages, run_start);
2566 }
2567 }
2568
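/*
 * Editor's sketch (not from the original file): the effect of
 * postcopy_chunk_hostpages_pass() expressed over a plain byte array.  If
 * any target page inside a host page is dirty, the whole host page is
 * marked dirty, so postcopy never has to send or discard half a hugepage.
 */
#include <stdbool.h>

static void example_canonicalize(bool *dirty, unsigned long npages,
                                 unsigned long host_ratio)
{
    unsigned long hp, tp;

    for (hp = 0; hp < npages; hp += host_ratio) {
        bool any_dirty = false;

        for (tp = hp; tp < hp + host_ratio && tp < npages; tp++) {
            any_dirty |= dirty[tp];
        }
        if (any_dirty) {
            for (tp = hp; tp < hp + host_ratio && tp < npages; tp++) {
                dirty[tp] = true;
            }
        }
    }
}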
2569 /**
2570 * postcopy_chunk_hostpages: discard any partially sent host page
2571 *
2572 * Utility for the outgoing postcopy code.
2573 *
2574 * Discard any partially sent host-page size chunks, mark any partially
2575 * dirty host-page size chunks as all dirty. In this case the host-page
2576 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2577 *
2578 * Returns zero on success
2579 *
2580 * @ms: current migration state
2581 * @block: block we want to work with
2582 */
2583 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2584 {
2585 postcopy_discard_send_init(ms, block->idstr);
2586
2587 /*
2588 * Ensure that all partially dirty host pages are made fully dirty.
2589 */
2590 postcopy_chunk_hostpages_pass(ms, block);
2591
2592 postcopy_discard_send_finish(ms);
2593 return 0;
2594 }
2595
2596 /**
2597 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2598 *
2599 * Returns zero on success
2600 *
2601 * Transmit the set of pages to be discarded after precopy to the target;
2602 * these are pages that:
2603 * a) have been previously transmitted but are now dirty again
2604 * b) have never been transmitted; this ensures that
2605 * any pages on the destination that have been mapped by background
2606 * tasks get discarded (transparent huge pages are the specific concern)
2607 * Hopefully this is pretty sparse.
2608 *
2609 * @ms: current migration state
2610 */
2611 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2612 {
2613 RAMState *rs = ram_state;
2614 RAMBlock *block;
2615 int ret;
2616
2617 RCU_READ_LOCK_GUARD();
2618
2619 /* This should be our last sync, the src is now paused */
2620 migration_bitmap_sync(rs);
2621
2622 /* Easiest way to make sure we don't resume in the middle of a host-page */
2623 rs->last_seen_block = NULL;
2624 rs->last_sent_block = NULL;
2625 rs->last_page = 0;
2626
2627 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2628 /* Deal with TPS != HPS and huge pages */
2629 ret = postcopy_chunk_hostpages(ms, block);
2630 if (ret) {
2631 return ret;
2632 }
2633
2634 #ifdef DEBUG_POSTCOPY
2635 ram_debug_dump_bitmap(block->bmap, true,
2636 block->used_length >> TARGET_PAGE_BITS);
2637 #endif
2638 }
2639 trace_ram_postcopy_send_discard_bitmap();
2640
2641 return postcopy_each_ram_send_discard(ms);
2642 }
2643
2644 /**
2645 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2646 *
2647 * Returns zero on success
2648 *
2649 * @rbname: name of the RAMBlock of the request. NULL means the
2650 * same as the last one.
2651 * @start: RAMBlock starting page
2652 * @length: RAMBlock size
2653 */
2654 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2655 {
2656 trace_ram_discard_range(rbname, start, length);
2657
2658 RCU_READ_LOCK_GUARD();
2659 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2660
2661 if (!rb) {
2662 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2663 return -1;
2664 }
2665
2666 /*
2667 * On source VM, we don't need to update the received bitmap since
2668 * we don't even have one.
2669 */
2670 if (rb->receivedmap) {
2671 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2672 length >> qemu_target_page_bits());
2673 }
2674
2675 return ram_block_discard_range(rb, start, length);
2676 }
2677
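/*
 * Editor's sketch (not from the original file): the core of what discarding
 * a range of anonymous guest RAM comes down to on Linux -- madvise() with
 * MADV_DONTNEED, which drops the pages so that later reads return zeroes.
 * File-backed blocks need a different mechanism (hole punching), which this
 * sketch deliberately leaves out; addr and len are assumed page-aligned.
 */
#include <stddef.h>
#include <sys/mman.h>

static int example_discard_anon_range(void *addr, size_t len)
{
    return madvise(addr, len, MADV_DONTNEED);
}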
2678 /*
2679 * For every allocation, we will try not to crash the VM if the
2680 * allocation fails.
2681 */
2682 static int xbzrle_init(void)
2683 {
2684 Error *local_err = NULL;
2685
2686 if (!migrate_use_xbzrle()) {
2687 return 0;
2688 }
2689
2690 XBZRLE_cache_lock();
2691
2692 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2693 if (!XBZRLE.zero_target_page) {
2694 error_report("%s: Error allocating zero page", __func__);
2695 goto err_out;
2696 }
2697
2698 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2699 TARGET_PAGE_SIZE, &local_err);
2700 if (!XBZRLE.cache) {
2701 error_report_err(local_err);
2702 goto free_zero_page;
2703 }
2704
2705 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2706 if (!XBZRLE.encoded_buf) {
2707 error_report("%s: Error allocating encoded_buf", __func__);
2708 goto free_cache;
2709 }
2710
2711 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2712 if (!XBZRLE.current_buf) {
2713 error_report("%s: Error allocating current_buf", __func__);
2714 goto free_encoded_buf;
2715 }
2716
2717 /* We are all good */
2718 XBZRLE_cache_unlock();
2719 return 0;
2720
2721 free_encoded_buf:
2722 g_free(XBZRLE.encoded_buf);
2723 XBZRLE.encoded_buf = NULL;
2724 free_cache:
2725 cache_fini(XBZRLE.cache);
2726 XBZRLE.cache = NULL;
2727 free_zero_page:
2728 g_free(XBZRLE.zero_target_page);
2729 XBZRLE.zero_target_page = NULL;
2730 err_out:
2731 XBZRLE_cache_unlock();
2732 return -ENOMEM;
2733 }
2734
2735 static int ram_state_init(RAMState **rsp)
2736 {
2737 *rsp = g_try_new0(RAMState, 1);
2738
2739 if (!*rsp) {
2740 error_report("%s: Init ramstate fail", __func__);
2741 return -1;
2742 }
2743
2744 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2745 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2746 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2747
2748 /*
2749 * Count the total number of pages used by ram blocks not including any
2750 * gaps due to alignment or unplugs.
2751 * This must match the initial values of the dirty bitmap.
2752 */
2753 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2754 ram_state_reset(*rsp);
2755
2756 return 0;
2757 }
2758
2759 static void ram_list_init_bitmaps(void)
2760 {
2761 MigrationState *ms = migrate_get_current();
2762 RAMBlock *block;
2763 unsigned long pages;
2764 uint8_t shift;
2765
2766 /* Skip setting bitmap if there is no RAM */
2767 if (ram_bytes_total()) {
2768 shift = ms->clear_bitmap_shift;
2769 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2770 error_report("clear_bitmap_shift (%u) too big, using "
2771 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2772 shift = CLEAR_BITMAP_SHIFT_MAX;
2773 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2774 error_report("clear_bitmap_shift (%u) too small, using "
2775 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2776 shift = CLEAR_BITMAP_SHIFT_MIN;
2777 }
2778
2779 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2780 pages = block->max_length >> TARGET_PAGE_BITS;
2781 /*
2782 * The initial dirty bitmap for migration must be set with all
2783 * ones to make sure we'll migrate every guest RAM page to the
2784 * destination.
2785 * Here we set RAMBlock.bmap all to 1 because when restarting
2786 * migration after a failed attempt, ram_list.
2787 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2788 * guest memory.
2789 */
2790 block->bmap = bitmap_new(pages);
2791 bitmap_set(block->bmap, 0, pages);
2792 block->clear_bmap_shift = shift;
2793 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2794 }
2795 }
2796 }
2797
2798 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2799 {
2800 unsigned long pages;
2801 RAMBlock *rb;
2802
2803 RCU_READ_LOCK_GUARD();
2804
2805 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2806 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2807 rs->migration_dirty_pages -= pages;
2808 }
2809 }
2810
2811 static void ram_init_bitmaps(RAMState *rs)
2812 {
2813 /* For memory_global_dirty_log_start below. */
2814 qemu_mutex_lock_iothread();
2815 qemu_mutex_lock_ramlist();
2816
2817 WITH_RCU_READ_LOCK_GUARD() {
2818 ram_list_init_bitmaps();
2819 /* We don't use dirty log with background snapshots */
2820 if (!migrate_background_snapshot()) {
2821 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2822 migration_bitmap_sync_precopy(rs);
2823 }
2824 }
2825 qemu_mutex_unlock_ramlist();
2826 qemu_mutex_unlock_iothread();
2827
2828 /*
2829 * After an eventual first bitmap sync, fixup the initial bitmap
2830 * containing all 1s to exclude any discarded pages from migration.
2831 */
2832 migration_bitmap_clear_discarded_pages(rs);
2833 }
2834
2835 static int ram_init_all(RAMState **rsp)
2836 {
2837 if (ram_state_init(rsp)) {
2838 return -1;
2839 }
2840
2841 if (xbzrle_init()) {
2842 ram_state_cleanup(rsp);
2843 return -1;
2844 }
2845
2846 ram_init_bitmaps(*rsp);
2847
2848 return 0;
2849 }
2850
2851 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2852 {
2853 RAMBlock *block;
2854 uint64_t pages = 0;
2855
2856 /*
2857 * Postcopy is not using xbzrle/compression, so no need for that.
2858 * Also, since the source is already halted, we don't need to care
2859 * about dirty page logging either.
2860 */
2861
2862 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2863 pages += bitmap_count_one(block->bmap,
2864 block->used_length >> TARGET_PAGE_BITS);
2865 }
2866
2867 /* This may not be aligned with current bitmaps. Recalculate. */
2868 rs->migration_dirty_pages = pages;
2869
2870 ram_state_reset(rs);
2871
2872 /* Update RAMState cache of output QEMUFile */
2873 rs->f = out;
2874
2875 trace_ram_state_resume_prepare(pages);
2876 }
2877
2878 /*
2879 * This function clears bits of the free pages reported by the caller from the
2880 * migration dirty bitmap. @addr is the host address corresponding to the
2881 * start of the contiguous guest free pages, and @len is the total bytes of
2882 * those pages.
2883 */
2884 void qemu_guest_free_page_hint(void *addr, size_t len)
2885 {
2886 RAMBlock *block;
2887 ram_addr_t offset;
2888 size_t used_len, start, npages;
2889 MigrationState *s = migrate_get_current();
2890
2891 /* This function is currently expected to be used during live migration */
2892 if (!migration_is_setup_or_active(s->state)) {
2893 return;
2894 }
2895
2896 for (; len > 0; len -= used_len, addr += used_len) {
2897 block = qemu_ram_block_from_host(addr, false, &offset);
2898 if (unlikely(!block || offset >= block->used_length)) {
2899 /*
2900 * The implementation might not support RAMBlock resize during
2901 * live migration, but it could happen in theory with future
2902 * updates. So we add a check here to capture that case.
2903 */
2904 error_report_once("%s unexpected error", __func__);
2905 return;
2906 }
2907
2908 if (len <= block->used_length - offset) {
2909 used_len = len;
2910 } else {
2911 used_len = block->used_length - offset;
2912 }
2913
2914 start = offset >> TARGET_PAGE_BITS;
2915 npages = used_len >> TARGET_PAGE_BITS;
2916
2917 qemu_mutex_lock(&ram_state->bitmap_mutex);
2918 /*
2919 * The skipped free pages are equivalent to having been sent from clear_bmap's
2920 * perspective, so clear the bits from the memory region bitmap which
2921 * are initially set. Otherwise those skipped pages will be sent in
2922 * the next round after syncing from the memory region bitmap.
2923 */
2924 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2925 ram_state->migration_dirty_pages -=
2926 bitmap_count_one_with_offset(block->bmap, start, npages);
2927 bitmap_clear(block->bmap, start, npages);
2928 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2929 }
2930 }
2931
2932 /*
2933 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2934 * a long-running RCU critical section. When rcu-reclaims in the code
2935 * start to become numerous it will be necessary to reduce the
2936 * granularity of these critical sections.
2937 */
2938
2939 /**
2940 * ram_save_setup: Setup RAM for migration
2941 *
2942 * Returns zero to indicate success and negative for error
2943 *
2944 * @f: QEMUFile where to send the data
2945 * @opaque: RAMState pointer
2946 */
2947 static int ram_save_setup(QEMUFile *f, void *opaque)
2948 {
2949 RAMState **rsp = opaque;
2950 RAMBlock *block;
2951
2952 if (compress_threads_save_setup()) {
2953 return -1;
2954 }
2955
2956 /* migration has already set up the bitmap, reuse it. */
2957 if (!migration_in_colo_state()) {
2958 if (ram_init_all(rsp) != 0) {
2959 compress_threads_save_cleanup();
2960 return -1;
2961 }
2962 }
2963 (*rsp)->f = f;
2964
2965 WITH_RCU_READ_LOCK_GUARD() {
2966 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2967
2968 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2969 qemu_put_byte(f, strlen(block->idstr));
2970 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2971 qemu_put_be64(f, block->used_length);
2972 if (migrate_postcopy_ram() && block->page_size !=
2973 qemu_host_page_size) {
2974 qemu_put_be64(f, block->page_size);
2975 }
2976 if (migrate_ignore_shared()) {
2977 qemu_put_be64(f, block->mr->addr);
2978 }
2979 }
2980 }
2981
2982 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2983 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2984
2985 multifd_send_sync_main(f);
2986 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2987 qemu_fflush(f);
2988
2989 return 0;
2990 }
2991
2992 /**
2993 * ram_save_iterate: iterative stage for migration
2994 *
2995 * Returns zero to indicate success and negative for error
2996 *
2997 * @f: QEMUFile where to send the data
2998 * @opaque: RAMState pointer
2999 */
3000 static int ram_save_iterate(QEMUFile *f, void *opaque)
3001 {
3002 RAMState **temp = opaque;
3003 RAMState *rs = *temp;
3004 int ret = 0;
3005 int i;
3006 int64_t t0;
3007 int done = 0;
3008
3009 if (blk_mig_bulk_active()) {
3010 /* Avoid transferring ram during bulk phase of block migration as
3011 * the bulk phase will usually take a long time and transferring
3012 * ram updates during that time is pointless. */
3013 goto out;
3014 }
3015
3016 /*
3017 * We'll hold this lock for a fairly long time, but it's okay for two reasons.
3018 * Firstly, the only possible other thread to take it is the one calling
3019 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3020 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3021 * guarantees that we'll release it on a regular basis at least.
3022 */
3023 qemu_mutex_lock(&rs->bitmap_mutex);
3024 WITH_RCU_READ_LOCK_GUARD() {
3025 if (ram_list.version != rs->last_version) {
3026 ram_state_reset(rs);
3027 }
3028
3029 /* Read version before ram_list.blocks */
3030 smp_rmb();
3031
3032 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3033
3034 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3035 i = 0;
3036 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3037 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3038 int pages;
3039
3040 if (qemu_file_get_error(f)) {
3041 break;
3042 }
3043
3044 pages = ram_find_and_save_block(rs, false);
3045 /* no more pages to send */
3046 if (pages == 0) {
3047 done = 1;
3048 break;
3049 }
3050
3051 if (pages < 0) {
3052 qemu_file_set_error(f, pages);
3053 break;
3054 }
3055
3056 rs->target_page_count += pages;
3057
3058 /*
3059 * During postcopy, it is necessary to make sure one whole host
3060 * page is sent in one chunk.
3061 */
3062 if (migrate_postcopy_ram()) {
3063 flush_compressed_data(rs);
3064 }
3065
3066 /*
3067 * We want to check in the 1st loop, just in case it was the 1st
3068 * time and we had to sync the dirty bitmap.
3069 * qemu_clock_get_ns() is a bit expensive, so we only check every
3070 * few iterations.
3071 */
3072 if ((i & 63) == 0) {
3073 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3074 1000000;
3075 if (t1 > MAX_WAIT) {
3076 trace_ram_save_iterate_big_wait(t1, i);
3077 break;
3078 }
3079 }
3080 i++;
3081 }
3082 }
3083 qemu_mutex_unlock(&rs->bitmap_mutex);
3084
3085 /*
3086 * Must occur before EOS (or any QEMUFile operation)
3087 * because of RDMA protocol.
3088 */
3089 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3090
3091 out:
3092 if (ret >= 0
3093 && migration_is_setup_or_active(migrate_get_current()->state)) {
3094 multifd_send_sync_main(rs->f);
3095 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3096 qemu_fflush(f);
3097 ram_counters.transferred += 8;
3098
3099 ret = qemu_file_get_error(f);
3100 }
3101 if (ret < 0) {
3102 return ret;
3103 }
3104
3105 return done;
3106 }
3107
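/*
 * Editor's sketch (not from the original file): the "check the clock only
 * every 64 iterations" pattern used by ram_save_iterate() above, with
 * clock_gettime() standing in for qemu_clock_get_ns().  EXAMPLE_MAX_WAIT_MS
 * mirrors the MAX_WAIT constant; everything else is invented for the
 * illustration.
 */
#include <stdint.h>
#include <time.h>

#define EXAMPLE_MAX_WAIT_MS 50

static uint64_t example_now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

static void example_bounded_loop(void)
{
    uint64_t t0 = example_now_ms();
    int i = 0;

    for (;;) {
        /* ... send one block of pages here, break when nothing is left ... */

        if ((i & 63) == 0 &&
            example_now_ms() - t0 > EXAMPLE_MAX_WAIT_MS) {
            break;      /* yield back to the main migration loop */
        }
        i++;
    }
}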
3108 /**
3109 * ram_save_complete: function called to send the remaining amount of ram
3110 *
3111 * Returns zero to indicate success or negative on error
3112 *
3113 * Called with iothread lock
3114 *
3115 * @f: QEMUFile where to send the data
3116 * @opaque: RAMState pointer
3117 */
3118 static int ram_save_complete(QEMUFile *f, void *opaque)
3119 {
3120 RAMState **temp = opaque;
3121 RAMState *rs = *temp;
3122 int ret = 0;
3123
3124 WITH_RCU_READ_LOCK_GUARD() {
3125 if (!migration_in_postcopy()) {
3126 migration_bitmap_sync_precopy(rs);
3127 }
3128
3129 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3130
3131 /* try transferring iterative blocks of memory */
3132
3133 /* flush all remaining blocks regardless of rate limiting */
3134 while (true) {
3135 int pages;
3136
3137 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3138 /* no more blocks to send */
3139 if (pages == 0) {
3140 break;
3141 }
3142 if (pages < 0) {
3143 ret = pages;
3144 break;
3145 }
3146 }
3147
3148 flush_compressed_data(rs);
3149 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3150 }
3151
3152 if (ret >= 0) {
3153 multifd_send_sync_main(rs->f);
3154 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3155 qemu_fflush(f);
3156 }
3157
3158 return ret;
3159 }
3160
3161 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3162 uint64_t *res_precopy_only,
3163 uint64_t *res_compatible,
3164 uint64_t *res_postcopy_only)
3165 {
3166 RAMState **temp = opaque;
3167 RAMState *rs = *temp;
3168 uint64_t remaining_size;
3169
3170 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3171
3172 if (!migration_in_postcopy() &&
3173 remaining_size < max_size) {
3174 qemu_mutex_lock_iothread();
3175 WITH_RCU_READ_LOCK_GUARD() {
3176 migration_bitmap_sync_precopy(rs);
3177 }
3178 qemu_mutex_unlock_iothread();
3179 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3180 }
3181
3182 if (migrate_postcopy_ram()) {
3183 /* We can do postcopy, and all the data is postcopiable */
3184 *res_compatible += remaining_size;
3185 } else {
3186 *res_precopy_only += remaining_size;
3187 }
3188 }
3189
3190 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3191 {
3192 unsigned int xh_len;
3193 int xh_flags;
3194 uint8_t *loaded_data;
3195
3196 /* extract RLE header */
3197 xh_flags = qemu_get_byte(f);
3198 xh_len = qemu_get_be16(f);
3199
3200 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3201 error_report("Failed to load XBZRLE page - wrong compression!");
3202 return -1;
3203 }
3204
3205 if (xh_len > TARGET_PAGE_SIZE) {
3206 error_report("Failed to load XBZRLE page - len overflow!");
3207 return -1;
3208 }
3209 loaded_data = XBZRLE.decoded_buf;
3210 /* load data and decode */
3211 /* it can change loaded_data to point to an internal buffer */
3212 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3213
3214 /* decode RLE */
3215 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3216 TARGET_PAGE_SIZE) == -1) {
3217 error_report("Failed to load XBZRLE page - decode error!");
3218 return -1;
3219 }
3220
3221 return 0;
3222 }
3223
3224 /**
3225 * ram_block_from_stream: read a RAMBlock id from the migration stream
3226 *
3227 * Must be called from within a rcu critical section.
3228 *
3229 * Returns a pointer from within the RCU-protected ram_list.
3230 *
3231 * @f: QEMUFile where to read the data from
3232 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3233 */
3234 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3235 {
3236 static RAMBlock *block;
3237 char id[256];
3238 uint8_t len;
3239
3240 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3241 if (!block) {
3242 error_report("Ack, bad migration stream!");
3243 return NULL;
3244 }
3245 return block;
3246 }
3247
3248 len = qemu_get_byte(f);
3249 qemu_get_buffer(f, (uint8_t *)id, len);
3250 id[len] = 0;
3251
3252 block = qemu_ram_block_by_name(id);
3253 if (!block) {
3254 error_report("Can't find block %s", id);
3255 return NULL;
3256 }
3257
3258 if (ramblock_is_ignored(block)) {
3259 error_report("block %s should not be migrated !", id);
3260 return NULL;
3261 }
3262
3263 return block;
3264 }
3265
3266 static inline void *host_from_ram_block_offset(RAMBlock *block,
3267 ram_addr_t offset)
3268 {
3269 if (!offset_in_ramblock(block, offset)) {
3270 return NULL;
3271 }
3272
3273 return block->host + offset;
3274 }
3275
3276 static void *host_page_from_ram_block_offset(RAMBlock *block,
3277 ram_addr_t offset)
3278 {
3279 /* Note: Explicitly no check against offset_in_ramblock(). */
3280 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3281 block->page_size);
3282 }
3283
3284 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3285 ram_addr_t offset)
3286 {
3287 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3288 }
3289
3290 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3291 ram_addr_t offset, bool record_bitmap)
3292 {
3293 if (!offset_in_ramblock(block, offset)) {
3294 return NULL;
3295 }
3296 if (!block->colo_cache) {
3297 error_report("%s: colo_cache is NULL in block :%s",
3298 __func__, block->idstr);
3299 return NULL;
3300 }
3301
3302 /*
3303 * During colo checkpoint, we need a bitmap of these migrated pages.
3304 * It helps us decide which pages in the ram cache should be flushed
3305 * into the VM's RAM later.
3306 */
3307 if (record_bitmap &&
3308 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3309 ram_state->migration_dirty_pages++;
3310 }
3311 return block->colo_cache + offset;
3312 }
3313
3314 /**
3315 * ram_handle_compressed: handle the zero page case
3316 *
3317 * If a page (or a whole RDMA chunk) has been
3318 * determined to be zero, then zap it.
3319 *
3320 * @host: host address for the zero page
3321 * @ch: what the page is filled from. We only support zero
3322 * @size: size of the zero page
3323 */
3324 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3325 {
3326 if (ch != 0 || !is_zero_range(host, size)) {
3327 memset(host, ch, size);
3328 }
3329 }
3330
3331 /* return the size after decompression, or a negative value on error */
3332 static int
3333 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3334 const uint8_t *source, size_t source_len)
3335 {
3336 int err;
3337
3338 err = inflateReset(stream);
3339 if (err != Z_OK) {
3340 return -1;
3341 }
3342
3343 stream->avail_in = source_len;
3344 stream->next_in = (uint8_t *)source;
3345 stream->avail_out = dest_len;
3346 stream->next_out = dest;
3347
3348 err = inflate(stream, Z_NO_FLUSH);
3349 if (err != Z_STREAM_END) {
3350 return -1;
3351 }
3352
3353 return stream->total_out;
3354 }
3355
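/*
 * Editor's sketch (not from the original file): a one-shot zlib round trip
 * using the high-level compress2()/uncompress() helpers, which wrap the
 * same deflate/inflate machinery that the per-thread z_streams here reuse
 * across pages.  The 4 KiB page size is an assumption for the example;
 * it returns 0 when the page survives the round trip.
 */
#include <string.h>
#include <zlib.h>

enum { EXAMPLE_PAGE_SIZE = 4096 };

static int example_zlib_roundtrip(const unsigned char *page)
{
    unsigned char comp[EXAMPLE_PAGE_SIZE + 512]; /* > compressBound(4096) */
    unsigned char out[EXAMPLE_PAGE_SIZE];
    uLongf comp_len = sizeof(comp);
    uLongf out_len = sizeof(out);

    if (compress2(comp, &comp_len, page, EXAMPLE_PAGE_SIZE,
                  Z_BEST_SPEED) != Z_OK ||
        uncompress(out, &out_len, comp, comp_len) != Z_OK ||
        out_len != EXAMPLE_PAGE_SIZE) {
        return -1;
    }
    return memcmp(out, page, EXAMPLE_PAGE_SIZE) ? -1 : 0;
}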
3356 static void *do_data_decompress(void *opaque)
3357 {
3358 DecompressParam *param = opaque;
3359 unsigned long pagesize;
3360 uint8_t *des;
3361 int len, ret;
3362
3363 qemu_mutex_lock(&param->mutex);
3364 while (!param->quit) {
3365 if (param->des) {
3366 des = param->des;
3367 len = param->len;
3368 param->des = 0;
3369 qemu_mutex_unlock(&param->mutex);
3370
3371 pagesize = TARGET_PAGE_SIZE;
3372
3373 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3374 param->compbuf, len);
3375 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3376 error_report("decompress data failed");
3377 qemu_file_set_error(decomp_file, ret);
3378 }
3379
3380 qemu_mutex_lock(&decomp_done_lock);
3381 param->done = true;
3382 qemu_cond_signal(&decomp_done_cond);
3383 qemu_mutex_unlock(&decomp_done_lock);
3384
3385 qemu_mutex_lock(&param->mutex);
3386 } else {
3387 qemu_cond_wait(&param->cond, &param->mutex);
3388 }
3389 }
3390 qemu_mutex_unlock(&param->mutex);
3391
3392 return NULL;
3393 }
3394
3395 static int wait_for_decompress_done(void)
3396 {
3397 int idx, thread_count;
3398
3399 if (!migrate_use_compression()) {
3400 return 0;
3401 }
3402
3403 thread_count = migrate_decompress_threads();
3404 qemu_mutex_lock(&decomp_done_lock);
3405 for (idx = 0; idx < thread_count; idx++) {
3406 while (!decomp_param[idx].done) {
3407 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3408 }
3409 }
3410 qemu_mutex_unlock(&decomp_done_lock);
3411 return qemu_file_get_error(decomp_file);
3412 }
3413
3414 static void compress_threads_load_cleanup(void)
3415 {
3416 int i, thread_count;
3417
3418 if (!migrate_use_compression()) {
3419 return;
3420 }
3421 thread_count = migrate_decompress_threads();
3422 for (i = 0; i < thread_count; i++) {
3423 /*
3424 * we use it as an indicator which shows if the thread is
3425 * properly init'd or not
3426 */
3427 if (!decomp_param[i].compbuf) {
3428 break;
3429 }
3430
3431 qemu_mutex_lock(&decomp_param[i].mutex);
3432 decomp_param[i].quit = true;
3433 qemu_cond_signal(&decomp_param[i].cond);
3434 qemu_mutex_unlock(&decomp_param[i].mutex);
3435 }
3436 for (i = 0; i < thread_count; i++) {
3437 if (!decomp_param[i].compbuf) {
3438 break;
3439 }
3440
3441 qemu_thread_join(decompress_threads + i);
3442 qemu_mutex_destroy(&decomp_param[i].mutex);
3443 qemu_cond_destroy(&decomp_param[i].cond);
3444 inflateEnd(&decomp_param[i].stream);
3445 g_free(decomp_param[i].compbuf);
3446 decomp_param[i].compbuf = NULL;
3447 }
3448 g_free(decompress_threads);
3449 g_free(decomp_param);
3450 decompress_threads = NULL;
3451 decomp_param = NULL;
3452 decomp_file = NULL;
3453 }
3454
3455 static int compress_threads_load_setup(QEMUFile *f)
3456 {
3457 int i, thread_count;
3458
3459 if (!migrate_use_compression()) {
3460 return 0;
3461 }
3462
3463 thread_count = migrate_decompress_threads();
3464 decompress_threads = g_new0(QemuThread, thread_count);
3465 decomp_param = g_new0(DecompressParam, thread_count);
3466 qemu_mutex_init(&decomp_done_lock);
3467 qemu_cond_init(&decomp_done_cond);
3468 decomp_file = f;
3469 for (i = 0; i < thread_count; i++) {
3470 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3471 goto exit;
3472 }
3473
3474 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3475 qemu_mutex_init(&decomp_param[i].mutex);
3476 qemu_cond_init(&decomp_param[i].cond);
3477 decomp_param[i].done = true;
3478 decomp_param[i].quit = false;
3479 qemu_thread_create(decompress_threads + i, "decompress",
3480 do_data_decompress, decomp_param + i,
3481 QEMU_THREAD_JOINABLE);
3482 }
3483 return 0;
3484 exit:
3485 compress_threads_load_cleanup();
3486 return -1;
3487 }
3488
3489 static void decompress_data_with_multi_threads(QEMUFile *f,
3490 void *host, int len)
3491 {
3492 int idx, thread_count;
3493
3494 thread_count = migrate_decompress_threads();
3495 QEMU_LOCK_GUARD(&decomp_done_lock);
3496 while (true) {
3497 for (idx = 0; idx < thread_count; idx++) {
3498 if (decomp_param[idx].done) {
3499 decomp_param[idx].done = false;
3500 qemu_mutex_lock(&decomp_param[idx].mutex);
3501 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3502 decomp_param[idx].des = host;
3503 decomp_param[idx].len = len;
3504 qemu_cond_signal(&decomp_param[idx].cond);
3505 qemu_mutex_unlock(&decomp_param[idx].mutex);
3506 break;
3507 }
3508 }
3509 if (idx < thread_count) {
3510 break;
3511 } else {
3512 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3513 }
3514 }
3515 }
3516
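/*
 * Editor's sketch (not from the original file): the hand-off used between
 * decompress_data_with_multi_threads() and do_data_decompress(), reduced to
 * a single pthreads worker.  The producer marks the slot busy and signals
 * the worker's condvar; the worker goes back to sleep when there is no
 * work.  All names are invented and the actual decompression is elided.
 */
#include <pthread.h>
#include <stdbool.h>

struct example_worker {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool has_work;
    bool quit;
};

static void *example_worker_fn(void *opaque)
{
    struct example_worker *w = opaque;

    pthread_mutex_lock(&w->lock);
    while (!w->quit) {
        if (w->has_work) {
            w->has_work = false;
            pthread_mutex_unlock(&w->lock);
            /* ... decompress one page here ... */
            pthread_mutex_lock(&w->lock);
        } else {
            pthread_cond_wait(&w->cond, &w->lock);
        }
    }
    pthread_mutex_unlock(&w->lock);
    return NULL;
}

static void example_submit(struct example_worker *w)
{
    pthread_mutex_lock(&w->lock);
    w->has_work = true;
    pthread_cond_signal(&w->cond);
    pthread_mutex_unlock(&w->lock);
}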
3517 static void colo_init_ram_state(void)
3518 {
3519 ram_state_init(&ram_state);
3520 }
3521
3522 /*
3523 * colo cache: this is for the secondary VM, we cache the whole
3524 * memory of the secondary VM, and the global lock needs to be held
3525 * when calling this helper.
3526 */
3527 int colo_init_ram_cache(void)
3528 {
3529 RAMBlock *block;
3530
3531 WITH_RCU_READ_LOCK_GUARD() {
3532 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3533 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3534 NULL, false, false);
3535 if (!block->colo_cache) {
3536 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3537 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3538 block->used_length);
3539 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3540 if (block->colo_cache) {
3541 qemu_anon_ram_free(block->colo_cache, block->used_length);
3542 block->colo_cache = NULL;
3543 }
3544 }
3545 return -errno;
3546 }
3547 if (!machine_dump_guest_core(current_machine)) {
3548 qemu_madvise(block->colo_cache, block->used_length,
3549 QEMU_MADV_DONTDUMP);
3550 }
3551 }
3552 }
3553
3554 /*
3555 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3556 * decide which pages in the cache should be flushed into the SVM's RAM. Here
3557 * we use the same name 'ram_bitmap' as for migration.
3558 */
3559 if (ram_bytes_total()) {
3560 RAMBlock *block;
3561
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3564 block->bmap = bitmap_new(pages);
3565 }
3566 }
3567
3568 colo_init_ram_state();
3569 return 0;
3570 }
3571
3572 /* TODO: duplicated with ram_init_bitmaps */
3573 void colo_incoming_start_dirty_log(void)
3574 {
3575 RAMBlock *block = NULL;
3576 /* For memory_global_dirty_log_start below. */
3577 qemu_mutex_lock_iothread();
3578 qemu_mutex_lock_ramlist();
3579
3580 memory_global_dirty_log_sync();
3581 WITH_RCU_READ_LOCK_GUARD() {
3582 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3583 ramblock_sync_dirty_bitmap(ram_state, block);
3584 /* Discard this dirty bitmap record */
3585 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3586 }
3587 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3588 }
3589 ram_state->migration_dirty_pages = 0;
3590 qemu_mutex_unlock_ramlist();
3591 qemu_mutex_unlock_iothread();
3592 }
3593
3594 /* The global lock needs to be held when calling this helper */
3595 void colo_release_ram_cache(void)
3596 {
3597 RAMBlock *block;
3598
3599 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3600 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3601 g_free(block->bmap);
3602 block->bmap = NULL;
3603 }
3604
3605 WITH_RCU_READ_LOCK_GUARD() {
3606 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3607 if (block->colo_cache) {
3608 qemu_anon_ram_free(block->colo_cache, block->used_length);
3609 block->colo_cache = NULL;
3610 }
3611 }
3612 }
3613 ram_state_cleanup(&ram_state);
3614 }
3615
3616 /**
3617 * ram_load_setup: Setup RAM for migration incoming side
3618 *
3619 * Returns zero to indicate success and negative for error
3620 *
3621 * @f: QEMUFile where to receive the data
3622 * @opaque: RAMState pointer
3623 */
3624 static int ram_load_setup(QEMUFile *f, void *opaque)
3625 {
3626 if (compress_threads_load_setup(f)) {
3627 return -1;
3628 }
3629
3630 xbzrle_load_setup();
3631 ramblock_recv_map_init();
3632
3633 return 0;
3634 }
3635
3636 static int ram_load_cleanup(void *opaque)
3637 {
3638 RAMBlock *rb;
3639
3640 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3641 qemu_ram_block_writeback(rb);
3642 }
3643
3644 xbzrle_load_cleanup();
3645 compress_threads_load_cleanup();
3646
3647 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3648 g_free(rb->receivedmap);
3649 rb->receivedmap = NULL;
3650 }
3651
3652 return 0;
3653 }
3654
3655 /**
3656 * ram_postcopy_incoming_init: allocate postcopy data structures
3657 *
3658 * Returns 0 for success and negative if there was one error
3659 *
3660 * @mis: current migration incoming state
3661 *
3662 * Allocate data structures etc needed by incoming migration with
3663 * postcopy-ram. postcopy-ram's similarly names
3664 * postcopy_ram_incoming_init does the work.
3665 */
3666 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3667 {
3668 return postcopy_ram_incoming_init(mis);
3669 }
3670
3671 /**
3672 * ram_load_postcopy: load a page in postcopy case
3673 *
3674 * Returns 0 for success or -errno in case of error
3675 *
3676 * Called in postcopy mode by ram_load().
3677 * rcu_read_lock is taken prior to this being called.
3678 *
3679 * @f: QEMUFile where to send the data
3680 */
3681 static int ram_load_postcopy(QEMUFile *f)
3682 {
3683 int flags = 0, ret = 0;
3684 bool place_needed = false;
3685 bool matches_target_page_size = false;
3686 MigrationIncomingState *mis = migration_incoming_get_current();
3687 /* Temporary page that is later 'placed' */
3688 void *postcopy_host_page = mis->postcopy_tmp_page;
3689 void *host_page = NULL;
3690 bool all_zero = true;
3691 int target_pages = 0;
3692
3693 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3694 ram_addr_t addr;
3695 void *page_buffer = NULL;
3696 void *place_source = NULL;
3697 RAMBlock *block = NULL;
3698 uint8_t ch;
3699 int len;
3700
3701 addr = qemu_get_be64(f);
3702
3703 /*
3704 * If qemu file error, we should stop here, and then "addr"
3705 * may be invalid
3706 */
3707 ret = qemu_file_get_error(f);
3708 if (ret) {
3709 break;
3710 }
3711
3712 flags = addr & ~TARGET_PAGE_MASK;
3713 addr &= TARGET_PAGE_MASK;
3714
3715 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3716 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3717 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3718 block = ram_block_from_stream(f, flags);
3719 if (!block) {
3720 ret = -EINVAL;
3721 break;
3722 }
3723
3724 /*
3725 * Relying on used_length is racy and can result in false positives.
3726 * We might place pages beyond used_length in case RAM was shrunk
3727 * while in postcopy, which is fine - trying to place via
3728 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3729 */
3730 if (!block->host || addr >= block->postcopy_length) {
3731 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3732 ret = -EINVAL;
3733 break;
3734 }
3735 target_pages++;
3736 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3737 /*
3738 * Postcopy requires that we place whole host pages atomically;
3739 * these may be huge pages for RAMBlocks that are backed by
3740 * hugetlbfs.
3741 * To make it atomic, the data is read into a temporary page
3742 * that's moved into place later.
3743 * The migration protocol uses, possibly smaller, target-pages
3744 * however the source ensures it always sends all the components
3745 * of a host page in one chunk.
3746 */
3747 page_buffer = postcopy_host_page +
3748 host_page_offset_from_ram_block_offset(block, addr);
3749 /* If all TP are zero then we can optimise the place */
3750 if (target_pages == 1) {
3751 host_page = host_page_from_ram_block_offset(block, addr);
3752 } else if (host_page != host_page_from_ram_block_offset(block,
3753 addr)) {
3754 /* not the 1st TP within the HP */
3755 error_report("Non-same host page %p/%p", host_page,
3756 host_page_from_ram_block_offset(block, addr));
3757 ret = -EINVAL;
3758 break;
3759 }
3760
3761 /*
3762 * If it's the last part of a host page then we place the host
3763 * page
3764 */
3765 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3766 place_needed = true;
3767 }
3768 place_source = postcopy_host_page;
3769 }
3770
3771 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3772 case RAM_SAVE_FLAG_ZERO:
3773 ch = qemu_get_byte(f);
3774 /*
3775 * We can skip setting page_buffer when
3776 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3777 */
3778 if (ch || !matches_target_page_size) {
3779 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3780 }
3781 if (ch) {
3782 all_zero = false;
3783 }
3784 break;
3785
3786 case RAM_SAVE_FLAG_PAGE:
3787 all_zero = false;
3788 if (!matches_target_page_size) {
3789 /* For huge pages, we always use temporary buffer */
3790 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3791 } else {
3792 /*
3793 * For small pages that matches target page size, we
3794 * avoid the qemu_file copy. Instead we directly use
3795 * the buffer of QEMUFile to place the page. Note: we
3796 * cannot do any QEMUFile operation before using that
3797 * buffer to make sure the buffer is valid when
3798 * placing the page.
3799 */
3800 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3801 TARGET_PAGE_SIZE);
3802 }
3803 break;
3804 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3805 all_zero = false;
3806 len = qemu_get_be32(f);
3807 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3808 error_report("Invalid compressed data length: %d", len);
3809 ret = -EINVAL;
3810 break;
3811 }
3812 decompress_data_with_multi_threads(f, page_buffer, len);
3813 break;
3814
3815 case RAM_SAVE_FLAG_EOS:
3816 /* normal exit */
3817 multifd_recv_sync_main();
3818 break;
3819 default:
3820 error_report("Unknown combination of migration flags: 0x%x"
3821 " (postcopy mode)", flags);
3822 ret = -EINVAL;
3823 break;
3824 }
3825
3826 /* Got the whole host page, wait for decompress before placing. */
3827 if (place_needed) {
3828 ret |= wait_for_decompress_done();
3829 }
3830
3831 /* Detect any possible file errors */
3832 if (!ret && qemu_file_get_error(f)) {
3833 ret = qemu_file_get_error(f);
3834 }
3835
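/*
 * Place the fully assembled host page atomically.  On Linux this is
 * typically done through userfaultfd (UFFDIO_ZEROPAGE for all-zero pages,
 * UFFDIO_COPY otherwise), as noted above, so the guest never observes a
 * partially populated host page.
 */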
3836 if (!ret && place_needed) {
3837 if (all_zero) {
3838 ret = postcopy_place_page_zero(mis, host_page, block);
3839 } else {
3840 ret = postcopy_place_page(mis, host_page, place_source,
3841 block);
3842 }
3843 place_needed = false;
3844 target_pages = 0;
3845 /* Assume we have a zero page until we detect something different */
3846 all_zero = true;
3847 }
3848 }
3849
3850 return ret;
3851 }
3852
3853 static bool postcopy_is_advised(void)
3854 {
3855 PostcopyState ps = postcopy_state_get();
3856 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3857 }
3858
3859 static bool postcopy_is_running(void)
3860 {
3861 PostcopyState ps = postcopy_state_get();
3862 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3863 }
3864
3865 /*
3866 * Flush the content of the RAM cache into the SVM's memory.
3867 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3868 */
3869 void colo_flush_ram_cache(void)
3870 {
3871 RAMBlock *block = NULL;
3872 void *dst_host;
3873 void *src_host;
3874 unsigned long offset = 0;
3875
3876 memory_global_dirty_log_sync();
3877 qemu_mutex_lock(&ram_state->bitmap_mutex);
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3880 ramblock_sync_dirty_bitmap(ram_state, block);
3881 }
3882 }
3883
3884 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3885 WITH_RCU_READ_LOCK_GUARD() {
3886 block = QLIST_FIRST_RCU(&ram_list.blocks);
3887
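/*
 * Walk every RAMBlock in turn: for each dirty page found in the bitmap,
 * clear its dirty bit and copy the page from the colo_cache back into the
 * SVM's memory.  Once no further dirty page is found within the current
 * block, move on to the next one.
 */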
3888 while (block) {
3889 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3890
3891 if (!offset_in_ramblock(block,
3892 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3893 offset = 0;
3894 block = QLIST_NEXT_RCU(block, next);
3895 } else {
3896 migration_bitmap_clear_dirty(ram_state, block, offset);
3897 dst_host = block->host
3898 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3899 src_host = block->colo_cache
3900 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3901 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3902 }
3903 }
3904 }
3905 trace_colo_flush_ram_cache_end();
3906 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3907 }
3908
3909 /**
3910 * ram_load_precopy: load pages in precopy case
3911 *
3912 * Returns 0 for success or -errno in case of error
3913 *
3914 * Called in precopy mode by ram_load().
3915 * rcu_read_lock is taken prior to this being called.
3916 *
3917 * @f: QEMUFile to read the data from
3918 */
3919 static int ram_load_precopy(QEMUFile *f)
3920 {
3921 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3922 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3923 bool postcopy_advised = postcopy_is_advised();
3924 if (!migrate_use_compression()) {
3925 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3926 }
3927
3928 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3929 ram_addr_t addr, total_ram_bytes;
3930 void *host = NULL, *host_bak = NULL;
3931 uint8_t ch;
3932
3933 /*
3934 * Yield periodically to let the main loop run, but an iteration of
3935 * the main loop is expensive, so only do it once every 32768 iterations.
3936 */
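/*
 * aio_co_schedule() queues this coroutine to be re-entered on its current
 * AioContext, so the yield below gives other pending main-loop work a
 * chance to run before page loading continues.
 */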
3937 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3938 aio_co_schedule(qemu_get_current_aio_context(),
3939 qemu_coroutine_self());
3940 qemu_coroutine_yield();
3941 }
3942 i++;
3943
3944 addr = qemu_get_be64(f);
3945 flags = addr & ~TARGET_PAGE_MASK;
3946 addr &= TARGET_PAGE_MASK;
3947
3948 if (flags & invalid_flags) {
3949 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3950 error_report("Received an unexpected compressed page");
3951 }
3952
3953 ret = -EINVAL;
3954 break;
3955 }
3956
3957 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3958 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3959 RAMBlock *block = ram_block_from_stream(f, flags);
3960
3961 host = host_from_ram_block_offset(block, addr);
3962 /*
3963 * After entering the COLO stage, we should not load pages into the
3964 * SVM's memory directly; we put them into the colo_cache first.
3965 * NOTE: We need to keep a copy of the SVM's RAM in the colo_cache.
3966 * Previously, all this memory was copied in the COLO preparation
3967 * stage, while the VM had to be stopped, which is time-consuming.
3968 * Here we optimize it with a trick: back up every page during the
3969 * migration process while COLO is enabled. Although this slows the
3970 * migration down a bit, it clearly reduces the downtime caused by
3971 * backing up all of the SVM's memory in the COLO preparation stage.
3972 */
3973 if (migration_incoming_colo_enabled()) {
3974 if (migration_incoming_in_colo_state()) {
3975 /* In COLO stage, put all pages into cache temporarily */
3976 host = colo_cache_from_block_offset(block, addr, true);
3977 } else {
3978 /*
3979 * In the migration stage but before the COLO stage,
3980 * put all pages into both the cache and the SVM's memory.
3981 */
3982 host_bak = colo_cache_from_block_offset(block, addr, false);
3983 }
3984 }
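/*
 * When host_bak is set, the page is loaded into the SVM's memory (host)
 * and then duplicated into the colo_cache by the memcpy() at the end of
 * this loop iteration.
 */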
3985 if (!host) {
3986 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3987 ret = -EINVAL;
3988 break;
3989 }
3990 if (!migration_incoming_in_colo_state()) {
3991 ramblock_recv_bitmap_set(block, host);
3992 }
3993
3994 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3995 }
3996
3997 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3998 case RAM_SAVE_FLAG_MEM_SIZE:
3999 /* Synchronize RAM block list */
4000 total_ram_bytes = addr;
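/*
 * The stream then carries one record per RAMBlock: a one-byte ID length,
 * the ID string itself, and a be64 block length.  When postcopy was
 * advised a be64 page size may follow, and with ignore-shared a be64 GPA
 * is appended as well, mirroring the parsing below.
 */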
4001 while (!ret && total_ram_bytes) {
4002 RAMBlock *block;
4003 char id[256];
4004 ram_addr_t length;
4005
4006 len = qemu_get_byte(f);
4007 qemu_get_buffer(f, (uint8_t *)id, len);
4008 id[len] = 0;
4009 length = qemu_get_be64(f);
4010
4011 block = qemu_ram_block_by_name(id);
4012 if (block && !qemu_ram_is_migratable(block)) {
4013 error_report("block %s should not be migrated!", id);
4014 ret = -EINVAL;
4015 } else if (block) {
4016 if (length != block->used_length) {
4017 Error *local_err = NULL;
4018
4019 ret = qemu_ram_resize(block, length,
4020 &local_err);
4021 if (local_err) {
4022 error_report_err(local_err);
4023 }
4024 }
4025 /* For postcopy we need to check hugepage sizes match */
4026 if (postcopy_advised && migrate_postcopy_ram() &&
4027 block->page_size != qemu_host_page_size) {
4028 uint64_t remote_page_size = qemu_get_be64(f);
4029 if (remote_page_size != block->page_size) {
4030 error_report("Mismatched RAM page size %s "
4031 "(local) %zd != %" PRId64,
4032 id, block->page_size,
4033 remote_page_size);
4034 ret = -EINVAL;
4035 }
4036 }
4037 if (migrate_ignore_shared()) {
4038 hwaddr addr = qemu_get_be64(f);
4039 if (ramblock_is_ignored(block) &&
4040 block->mr->addr != addr) {
4041 error_report("Mismatched GPAs for block %s "
4042 "%" PRId64 " != %" PRId64,
4043 id, (uint64_t)addr,
4044 (uint64_t)block->mr->addr);
4045 ret = -EINVAL;
4046 }
4047 }
4048 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4049 block->idstr);
4050 } else {
4051 error_report("Unknown ramblock \"%s\", cannot "
4052 "accept migration", id);
4053 ret = -EINVAL;
4054 }
4055
4056 total_ram_bytes -= length;
4057 }
4058 break;
4059
4060 case RAM_SAVE_FLAG_ZERO:
4061 ch = qemu_get_byte(f);
4062 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4063 break;
4064
4065 case RAM_SAVE_FLAG_PAGE:
4066 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4067 break;
4068
4069 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4070 len = qemu_get_be32(f);
4071 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4072 error_report("Invalid compressed data length: %d", len);
4073 ret = -EINVAL;
4074 break;
4075 }
4076 decompress_data_with_multi_threads(f, host, len);
4077 break;
4078
4079 case RAM_SAVE_FLAG_XBZRLE:
4080 if (load_xbzrle(f, addr, host) < 0) {
4081 error_report("Failed to decompress XBZRLE page at "
4082 RAM_ADDR_FMT, addr);
4083 ret = -EINVAL;
4084 break;
4085 }
4086 break;
4087 case RAM_SAVE_FLAG_EOS:
4088 /* normal exit */
4089 multifd_recv_sync_main();
4090 break;
4091 default:
4092 if (flags & RAM_SAVE_FLAG_HOOK) {
4093 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4094 } else {
4095 error_report("Unknown combination of migration flags: 0x%x",
4096 flags);
4097 ret = -EINVAL;
4098 }
4099 }
4100 if (!ret) {
4101 ret = qemu_file_get_error(f);
4102 }
4103 if (!ret && host_bak) {
4104 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4105 }
4106 }
4107
4108 ret |= wait_for_decompress_done();
4109 return ret;
4110 }
4111
4112 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4113 {
4114 int ret = 0;
4115 static uint64_t seq_iter;
4116 /*
4117 * If the system is running in postcopy mode, page inserts into host
4118 * memory must be atomic.
4119 */
4120 bool postcopy_running = postcopy_is_running();
4121
4122 seq_iter++;
4123
4124 if (version_id != 4) {
4125 return -EINVAL;
4126 }
4127
4128 /*
4129 * This RCU critical section can be very long running.
4130 * When RCU reclaims in the code start to become numerous,
4131 * it will be necessary to reduce the granularity of this
4132 * critical section.
4133 */
4134 WITH_RCU_READ_LOCK_GUARD() {
4135 if (postcopy_running) {
4136 ret = ram_load_postcopy(f);
4137 } else {
4138 ret = ram_load_precopy(f);
4139 }
4140 }
4141 trace_ram_load_complete(ret, seq_iter);
4142
4143 return ret;
4144 }
4145
4146 static bool ram_has_postcopy(void *opaque)
4147 {
4148 RAMBlock *rb;
4149 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4150 if (ramblock_is_pmem(rb)) {
4151 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4152 "is not supported now!", rb->idstr, rb->host);
4153 return false;
4154 }
4155 }
4156
4157 return migrate_postcopy_ram();
4158 }
4159
4160 /* Sync all the dirty bitmaps with the destination VM. */
4161 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4162 {
4163 RAMBlock *block;
4164 QEMUFile *file = s->to_dst_file;
4165 int ramblock_count = 0;
4166
4167 trace_ram_dirty_bitmap_sync_start();
4168
4169 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4170 qemu_savevm_send_recv_bitmap(file, block->idstr);
4171 trace_ram_dirty_bitmap_request(block->idstr);
4172 ramblock_count++;
4173 }
4174
4175 trace_ram_dirty_bitmap_sync_wait();
4176
4177 /* Wait until all the ramblocks' dirty bitmaps have been synced */
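/*
 * Each qemu_sem_wait() pairs with a qemu_sem_post() issued by
 * ram_dirty_bitmap_reload_notify() once ram_dirty_bitmap_reload() has
 * finished handling one block's bitmap on the return path.
 */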
4178 while (ramblock_count--) {
4179 qemu_sem_wait(&s->rp_state.rp_sem);
4180 }
4181
4182 trace_ram_dirty_bitmap_sync_complete();
4183
4184 return 0;
4185 }
4186
4187 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4188 {
4189 qemu_sem_post(&s->rp_state.rp_sem);
4190 }
4191
4192 /*
4193 * Read the received bitmap and invert it to form the initial dirty
4194 * bitmap. This is only used when a postcopy migration is paused and
4195 * we want to resume it from a middle point.
4196 */
4197 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4198 {
4199 int ret = -EINVAL;
4200 /* from_dst_file is always valid because we're within rp_thread */
4201 QEMUFile *file = s->rp_state.from_dst_file;
4202 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4203 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4204 uint64_t size, end_mark;
4205
4206 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4207
4208 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4209 error_report("%s: incorrect state %s", __func__,
4210 MigrationStatus_str(s->state));
4211 return -EINVAL;
4212 }
4213
4214 /*
4215 * Note: see comments in ramblock_recv_bitmap_send() on why we
4216 * need the endianness conversion and the padding.
4217 */
4218 local_size = ROUND_UP(local_size, 8);
4219
4220 /* Add padding */
4221 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
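/*
 * For example, a 1 GiB block with 4 KiB target pages has nbits = 262144,
 * giving local_size = 32768 bytes (already a multiple of 8, so unchanged
 * by the ROUND_UP above).
 */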
4222
4223 size = qemu_get_be64(file);
4224
4225 /* The size of the bitmap should match that of our ramblock */
4226 if (size != local_size) {
4227 error_report("%s: ramblock '%s' bitmap size mismatch "
4228 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4229 block->idstr, size, local_size);
4230 ret = -EINVAL;
4231 goto out;
4232 }
4233
4234 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4235 end_mark = qemu_get_be64(file);
4236
4237 ret = qemu_file_get_error(file);
4238 if (ret || size != local_size) {
4239 error_report("%s: read bitmap failed for ramblock '%s': %d"
4240 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4241 __func__, block->idstr, ret, local_size, size);
4242 ret = -EIO;
4243 goto out;
4244 }
4245
4246 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4247 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4248 __func__, block->idstr, end_mark);
4249 ret = -EINVAL;
4250 goto out;
4251 }
4252
4253 /*
4254 * Endianness conversion. We are in postcopy (though paused), so
4255 * the dirty bitmap won't change and we can modify it directly.
4256 */
4257 bitmap_from_le(block->bmap, le_bitmap, nbits);
4258
4259 /*
4260 * What we received is the "received bitmap". Invert it to get the
4261 * initial dirty bitmap for this ramblock.
4262 */
4263 bitmap_complement(block->bmap, block->bmap, nbits);
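/*
 * In other words: a page already received on the destination (bit set in
 * the received bitmap) becomes clean and won't be resent, while a page
 * that was never received becomes dirty and will be sent again once the
 * migration resumes.
 */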
4264
4265 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4266 ramblock_dirty_bitmap_clear_discarded_pages(block);
4267
4268 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4269 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4270
4271 /*
4272 * We have successfully synced the bitmap for the current ramblock. If
4273 * this is the last one to sync, we need to notify the main send thread.
4274 */
4275 ram_dirty_bitmap_reload_notify(s);
4276
4277 ret = 0;
4278 out:
4279 g_free(le_bitmap);
4280 return ret;
4281 }
4282
4283 static int ram_resume_prepare(MigrationState *s, void *opaque)
4284 {
4285 RAMState *rs = *(RAMState **)opaque;
4286 int ret;
4287
4288 ret = ram_dirty_bitmap_sync_all(s, rs);
4289 if (ret) {
4290 return ret;
4291 }
4292
4293 ram_state_resume_prepare(rs, s->to_dst_file);
4294
4295 return 0;
4296 }
4297
4298 static SaveVMHandlers savevm_ram_handlers = {
4299 .save_setup = ram_save_setup,
4300 .save_live_iterate = ram_save_iterate,
4301 .save_live_complete_postcopy = ram_save_complete,
4302 .save_live_complete_precopy = ram_save_complete,
4303 .has_postcopy = ram_has_postcopy,
4304 .save_live_pending = ram_save_pending,
4305 .load_state = ram_load,
4306 .save_cleanup = ram_save_cleanup,
4307 .load_setup = ram_load_setup,
4308 .load_cleanup = ram_load_cleanup,
4309 .resume_prepare = ram_resume_prepare,
4310 };
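/*
 * These handlers are invoked by the generic savevm/migration core.  They
 * are registered by ram_mig_init() below via
 * register_savevm_live("ram", 0, 4, ...), and the section version 4
 * matches the version_id check in ram_load().
 */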
4311
4312 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4313 size_t old_size, size_t new_size)
4314 {
4315 PostcopyState ps = postcopy_state_get();
4316 ram_addr_t offset;
4317 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4318 Error *err = NULL;
4319
4320 if (ramblock_is_ignored(rb)) {
4321 return;
4322 }
4323
4324 if (!migration_is_idle()) {
4325 /*
4326 * Precopy code on the source cannot deal with the size of RAM blocks
4327 * changing at random points in time - especially after sending the
4328 * RAM block sizes in the migration stream, they must no longer change.
4329 * Abort and indicate a proper reason.
4330 */
4331 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4332 migration_cancel(err);
4333 error_free(err);
4334 }
4335
4336 switch (ps) {
4337 case POSTCOPY_INCOMING_ADVISE:
4338 /*
4339 * Update what ram_postcopy_incoming_init()->init_range() does at the
4340 * time postcopy was advised. Syncing RAM blocks with the source will
4341 * result in RAM resizes.
4342 */
4343 if (old_size < new_size) {
4344 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4345 error_report("RAM block '%s' discard of resized RAM failed",
4346 rb->idstr);
4347 }
4348 }
4349 rb->postcopy_length = new_size;
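/*
 * postcopy_length is what ram_load_postcopy() checks incoming page
 * offsets against (the addr >= block->postcopy_length test above), so
 * the bound now reflects the new size.
 */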
4350 break;
4351 case POSTCOPY_INCOMING_NONE:
4352 case POSTCOPY_INCOMING_RUNNING:
4353 case POSTCOPY_INCOMING_END:
4354 /*
4355 * Once our guest is running, postcopy no longer cares about
4356 * resizes. When growing, the new memory was not available on the
4357 * source, so no handling is needed.
4358 */
4359 break;
4360 default:
4361 error_report("RAM block '%s' resized during postcopy state: %d",
4362 rb->idstr, ps);
4363 exit(-1);
4364 }
4365 }
4366
4367 static RAMBlockNotifier ram_mig_ram_notifier = {
4368 .ram_block_resized = ram_mig_ram_block_resized,
4369 };
4370
4371 void ram_mig_init(void)
4372 {
4373 qemu_mutex_init(&XBZRLE.lock);
4374 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4375 ram_block_notifier_add(&ram_mig_ram_notifier);
4376 }