]> git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
f69d8d42b03ed010aebd8fa0799b526dfd136ceb
[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram-compress.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration-stats.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-types-migration.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/cpu-throttle.h"
57 #include "savevm.h"
58 #include "qemu/iov.h"
59 #include "multifd.h"
60 #include "sysemu/runstate.h"
61 #include "options.h"
62
63 #include "hw/boards.h" /* for machine_dump_guest_core() */
64
65 #if defined(__linux__)
66 #include "qemu/userfaultfd.h"
67 #endif /* defined(__linux__) */
68
69 /***********************************************************/
70 /* ram save/restore */
71
72 /*
73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
74 * worked for pages that were filled with the same char. We switched
75 * it to only search for the zero value. And to avoid confusion with
76 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
77 */
78 /*
79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
80 */
81 #define RAM_SAVE_FLAG_FULL 0x01
82 #define RAM_SAVE_FLAG_ZERO 0x02
83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
84 #define RAM_SAVE_FLAG_PAGE 0x08
85 #define RAM_SAVE_FLAG_EOS 0x10
86 #define RAM_SAVE_FLAG_CONTINUE 0x20
87 #define RAM_SAVE_FLAG_XBZRLE 0x40
88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
89 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
91 /* We can't use any flag that is bigger than 0x200 */
92
93 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
94 uint8_t *, int) = xbzrle_encode_buffer;
95 #if defined(CONFIG_AVX512BW_OPT)
96 #include "qemu/cpuid.h"
97 static void __attribute__((constructor)) init_cpu_flag(void)
98 {
99 unsigned max = __get_cpuid_max(0, NULL);
100 int a, b, c, d;
101 if (max >= 1) {
102 __cpuid(1, a, b, c, d);
103 /* We must check that AVX is not just available, but usable. */
104 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
105 int bv;
106 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
107 __cpuid_count(7, 0, a, b, c, d);
108 /* 0xe6:
109 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
110 * and ZMM16-ZMM31 state are enabled by OS)
111 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
112 */
113 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
114 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
115 }
116 }
117 }
118 }
119 #endif
120
121 XBZRLECacheStats xbzrle_counters;
122
123 /* used by the search for pages to send */
124 struct PageSearchStatus {
125 /* The migration channel used for a specific host page */
126 QEMUFile *pss_channel;
127 /* Last block from where we have sent data */
128 RAMBlock *last_sent_block;
129 /* Current block being searched */
130 RAMBlock *block;
131 /* Current page to search from */
132 unsigned long page;
133 /* Set once we wrap around */
134 bool complete_round;
135 /* Whether we're sending a host page */
136 bool host_page_sending;
137 /* The start/end of current host page. Invalid if host_page_sending==false */
138 unsigned long host_page_start;
139 unsigned long host_page_end;
140 };
141 typedef struct PageSearchStatus PageSearchStatus;
142
143 /* struct contains XBZRLE cache and a static page
144 used by the compression */
145 static struct {
146 /* buffer used for XBZRLE encoding */
147 uint8_t *encoded_buf;
148 /* buffer for storing page content */
149 uint8_t *current_buf;
150 /* Cache for XBZRLE, Protected by lock. */
151 PageCache *cache;
152 QemuMutex lock;
153 /* it will store a page full of zeros */
154 uint8_t *zero_target_page;
155 /* buffer used for XBZRLE decoding */
156 uint8_t *decoded_buf;
157 } XBZRLE;
158
159 static void XBZRLE_cache_lock(void)
160 {
161 if (migrate_xbzrle()) {
162 qemu_mutex_lock(&XBZRLE.lock);
163 }
164 }
165
166 static void XBZRLE_cache_unlock(void)
167 {
168 if (migrate_xbzrle()) {
169 qemu_mutex_unlock(&XBZRLE.lock);
170 }
171 }
172
173 /**
174 * xbzrle_cache_resize: resize the xbzrle cache
175 *
176 * This function is called from migrate_params_apply in main
177 * thread, possibly while a migration is in progress. A running
178 * migration may be using the cache and might finish during this call,
179 * hence changes to the cache are protected by XBZRLE.lock().
180 *
181 * Returns 0 for success or -1 for error
182 *
183 * @new_size: new cache size
184 * @errp: set *errp if the check failed, with reason
185 */
186 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
187 {
188 PageCache *new_cache;
189 int64_t ret = 0;
190
191 /* Check for truncation */
192 if (new_size != (size_t)new_size) {
193 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
194 "exceeding address space");
195 return -1;
196 }
197
198 if (new_size == migrate_xbzrle_cache_size()) {
199 /* nothing to do */
200 return 0;
201 }
202
203 XBZRLE_cache_lock();
204
205 if (XBZRLE.cache != NULL) {
206 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
207 if (!new_cache) {
208 ret = -1;
209 goto out;
210 }
211
212 cache_fini(XBZRLE.cache);
213 XBZRLE.cache = new_cache;
214 }
215 out:
216 XBZRLE_cache_unlock();
217 return ret;
218 }
219
220 static bool postcopy_preempt_active(void)
221 {
222 return migrate_postcopy_preempt() && migration_in_postcopy();
223 }
224
225 bool ramblock_is_ignored(RAMBlock *block)
226 {
227 return !qemu_ram_is_migratable(block) ||
228 (migrate_ignore_shared() && qemu_ram_is_shared(block));
229 }
230
231 #undef RAMBLOCK_FOREACH
232
233 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
234 {
235 RAMBlock *block;
236 int ret = 0;
237
238 RCU_READ_LOCK_GUARD();
239
240 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
241 ret = func(block, opaque);
242 if (ret) {
243 break;
244 }
245 }
246 return ret;
247 }
248
249 static void ramblock_recv_map_init(void)
250 {
251 RAMBlock *rb;
252
253 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
254 assert(!rb->receivedmap);
255 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
256 }
257 }
258
259 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
260 {
261 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
262 rb->receivedmap);
263 }
264
265 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
266 {
267 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
268 }
269
270 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
271 {
272 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
273 }
274
275 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
276 size_t nr)
277 {
278 bitmap_set_atomic(rb->receivedmap,
279 ramblock_recv_bitmap_offset(host_addr, rb),
280 nr);
281 }
282
283 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
284
285 /*
286 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
287 *
288 * Returns >0 if success with sent bytes, or <0 if error.
289 */
290 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
291 const char *block_name)
292 {
293 RAMBlock *block = qemu_ram_block_by_name(block_name);
294 unsigned long *le_bitmap, nbits;
295 uint64_t size;
296
297 if (!block) {
298 error_report("%s: invalid block name: %s", __func__, block_name);
299 return -1;
300 }
301
302 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
303
304 /*
305 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
306 * machines we may need 4 more bytes for padding (see below
307 * comment). So extend it a bit before hand.
308 */
309 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
310
311 /*
312 * Always use little endian when sending the bitmap. This is
313 * required that when source and destination VMs are not using the
314 * same endianness. (Note: big endian won't work.)
315 */
316 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
317
318 /* Size of the bitmap, in bytes */
319 size = DIV_ROUND_UP(nbits, 8);
320
321 /*
322 * size is always aligned to 8 bytes for 64bit machines, but it
323 * may not be true for 32bit machines. We need this padding to
324 * make sure the migration can survive even between 32bit and
325 * 64bit machines.
326 */
327 size = ROUND_UP(size, 8);
328
329 qemu_put_be64(file, size);
330 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
331 /*
332 * Mark as an end, in case the middle part is screwed up due to
333 * some "mysterious" reason.
334 */
335 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
336 qemu_fflush(file);
337
338 g_free(le_bitmap);
339
340 if (qemu_file_get_error(file)) {
341 return qemu_file_get_error(file);
342 }
343
344 return size + sizeof(size);
345 }
346
347 /*
348 * An outstanding page request, on the source, having been received
349 * and queued
350 */
351 struct RAMSrcPageRequest {
352 RAMBlock *rb;
353 hwaddr offset;
354 hwaddr len;
355
356 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
357 };
358
359 /* State of RAM for migration */
360 struct RAMState {
361 /*
362 * PageSearchStatus structures for the channels when send pages.
363 * Protected by the bitmap_mutex.
364 */
365 PageSearchStatus pss[RAM_CHANNEL_MAX];
366 /* UFFD file descriptor, used in 'write-tracking' migration */
367 int uffdio_fd;
368 /* total ram size in bytes */
369 uint64_t ram_bytes_total;
370 /* Last block that we have visited searching for dirty pages */
371 RAMBlock *last_seen_block;
372 /* Last dirty target page we have sent */
373 ram_addr_t last_page;
374 /* last ram version we have seen */
375 uint32_t last_version;
376 /* How many times we have dirty too many pages */
377 int dirty_rate_high_cnt;
378 /* these variables are used for bitmap sync */
379 /* last time we did a full bitmap_sync */
380 int64_t time_last_bitmap_sync;
381 /* bytes transferred at start_time */
382 uint64_t bytes_xfer_prev;
383 /* number of dirty pages since start_time */
384 uint64_t num_dirty_pages_period;
385 /* xbzrle misses since the beginning of the period */
386 uint64_t xbzrle_cache_miss_prev;
387 /* Amount of xbzrle pages since the beginning of the period */
388 uint64_t xbzrle_pages_prev;
389 /* Amount of xbzrle encoded bytes since the beginning of the period */
390 uint64_t xbzrle_bytes_prev;
391 /* Are we really using XBZRLE (e.g., after the first round). */
392 bool xbzrle_started;
393 /* Are we on the last stage of migration */
394 bool last_stage;
395 /* compression statistics since the beginning of the period */
396 /* amount of count that no free thread to compress data */
397 uint64_t compress_thread_busy_prev;
398 /* amount bytes after compression */
399 uint64_t compressed_size_prev;
400 /* amount of compressed pages */
401 uint64_t compress_pages_prev;
402
403 /* total handled target pages at the beginning of period */
404 uint64_t target_page_count_prev;
405 /* total handled target pages since start */
406 uint64_t target_page_count;
407 /* number of dirty bits in the bitmap */
408 uint64_t migration_dirty_pages;
409 /*
410 * Protects:
411 * - dirty/clear bitmap
412 * - migration_dirty_pages
413 * - pss structures
414 */
415 QemuMutex bitmap_mutex;
416 /* The RAMBlock used in the last src_page_requests */
417 RAMBlock *last_req_rb;
418 /* Queue of outstanding page requests from the destination */
419 QemuMutex src_page_req_mutex;
420 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
421 };
422 typedef struct RAMState RAMState;
423
424 static RAMState *ram_state;
425
426 static NotifierWithReturnList precopy_notifier_list;
427
428 /* Whether postcopy has queued requests? */
429 static bool postcopy_has_request(RAMState *rs)
430 {
431 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
432 }
433
434 void precopy_infrastructure_init(void)
435 {
436 notifier_with_return_list_init(&precopy_notifier_list);
437 }
438
439 void precopy_add_notifier(NotifierWithReturn *n)
440 {
441 notifier_with_return_list_add(&precopy_notifier_list, n);
442 }
443
444 void precopy_remove_notifier(NotifierWithReturn *n)
445 {
446 notifier_with_return_remove(n);
447 }
448
449 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
450 {
451 PrecopyNotifyData pnd;
452 pnd.reason = reason;
453 pnd.errp = errp;
454
455 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
456 }
457
458 uint64_t ram_bytes_remaining(void)
459 {
460 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
461 0;
462 }
463
464 void ram_transferred_add(uint64_t bytes)
465 {
466 if (runstate_is_running()) {
467 stat64_add(&mig_stats.precopy_bytes, bytes);
468 } else if (migration_in_postcopy()) {
469 stat64_add(&mig_stats.postcopy_bytes, bytes);
470 } else {
471 stat64_add(&mig_stats.downtime_bytes, bytes);
472 }
473 stat64_add(&mig_stats.transferred, bytes);
474 }
475
476 struct MigrationOps {
477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478 };
479 typedef struct MigrationOps MigrationOps;
480
481 MigrationOps *migration_ops;
482
483 static int ram_save_host_page_urgent(PageSearchStatus *pss);
484
485 /* NOTE: page is the PFN not real ram_addr_t. */
486 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
487 {
488 pss->block = rb;
489 pss->page = page;
490 pss->complete_round = false;
491 }
492
493 /*
494 * Check whether two PSSs are actively sending the same page. Return true
495 * if it is, false otherwise.
496 */
497 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
498 {
499 return pss1->host_page_sending && pss2->host_page_sending &&
500 (pss1->host_page_start == pss2->host_page_start);
501 }
502
503 /**
504 * save_page_header: write page header to wire
505 *
506 * If this is the 1st block, it also writes the block identification
507 *
508 * Returns the number of bytes written
509 *
510 * @pss: current PSS channel status
511 * @block: block that contains the page we want to send
512 * @offset: offset inside the block for the page
513 * in the lower bits, it contains flags
514 */
515 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
516 RAMBlock *block, ram_addr_t offset)
517 {
518 size_t size, len;
519 bool same_block = (block == pss->last_sent_block);
520
521 if (same_block) {
522 offset |= RAM_SAVE_FLAG_CONTINUE;
523 }
524 qemu_put_be64(f, offset);
525 size = 8;
526
527 if (!same_block) {
528 len = strlen(block->idstr);
529 qemu_put_byte(f, len);
530 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
531 size += 1 + len;
532 pss->last_sent_block = block;
533 }
534 return size;
535 }
536
537 /**
538 * mig_throttle_guest_down: throttle down the guest
539 *
540 * Reduce amount of guest cpu execution to hopefully slow down memory
541 * writes. If guest dirty memory rate is reduced below the rate at
542 * which we can transfer pages to the destination then we should be
543 * able to complete migration. Some workloads dirty memory way too
544 * fast and will not effectively converge, even with auto-converge.
545 */
546 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
547 uint64_t bytes_dirty_threshold)
548 {
549 uint64_t pct_initial = migrate_cpu_throttle_initial();
550 uint64_t pct_increment = migrate_cpu_throttle_increment();
551 bool pct_tailslow = migrate_cpu_throttle_tailslow();
552 int pct_max = migrate_max_cpu_throttle();
553
554 uint64_t throttle_now = cpu_throttle_get_percentage();
555 uint64_t cpu_now, cpu_ideal, throttle_inc;
556
557 /* We have not started throttling yet. Let's start it. */
558 if (!cpu_throttle_active()) {
559 cpu_throttle_set(pct_initial);
560 } else {
561 /* Throttling already on, just increase the rate */
562 if (!pct_tailslow) {
563 throttle_inc = pct_increment;
564 } else {
565 /* Compute the ideal CPU percentage used by Guest, which may
566 * make the dirty rate match the dirty rate threshold. */
567 cpu_now = 100 - throttle_now;
568 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
569 bytes_dirty_period);
570 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
571 }
572 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
573 }
574 }
575
576 void mig_throttle_counter_reset(void)
577 {
578 RAMState *rs = ram_state;
579
580 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
581 rs->num_dirty_pages_period = 0;
582 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
583 }
584
585 /**
586 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
587 *
588 * @rs: current RAM state
589 * @current_addr: address for the zero page
590 *
591 * Update the xbzrle cache to reflect a page that's been sent as all 0.
592 * The important thing is that a stale (not-yet-0'd) page be replaced
593 * by the new data.
594 * As a bonus, if the page wasn't in the cache it gets added so that
595 * when a small write is made into the 0'd page it gets XBZRLE sent.
596 */
597 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
598 {
599 /* We don't care if this fails to allocate a new cache page
600 * as long as it updated an old one */
601 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
602 stat64_get(&mig_stats.dirty_sync_count));
603 }
604
605 #define ENCODING_FLAG_XBZRLE 0x1
606
607 /**
608 * save_xbzrle_page: compress and send current page
609 *
610 * Returns: 1 means that we wrote the page
611 * 0 means that page is identical to the one already sent
612 * -1 means that xbzrle would be longer than normal
613 *
614 * @rs: current RAM state
615 * @pss: current PSS channel
616 * @current_data: pointer to the address of the page contents
617 * @current_addr: addr of the page
618 * @block: block that contains the page we want to send
619 * @offset: offset inside the block for the page
620 */
621 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
622 uint8_t **current_data, ram_addr_t current_addr,
623 RAMBlock *block, ram_addr_t offset)
624 {
625 int encoded_len = 0, bytes_xbzrle;
626 uint8_t *prev_cached_page;
627 QEMUFile *file = pss->pss_channel;
628 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
629
630 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
631 xbzrle_counters.cache_miss++;
632 if (!rs->last_stage) {
633 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
634 generation) == -1) {
635 return -1;
636 } else {
637 /* update *current_data when the page has been
638 inserted into cache */
639 *current_data = get_cached_data(XBZRLE.cache, current_addr);
640 }
641 }
642 return -1;
643 }
644
645 /*
646 * Reaching here means the page has hit the xbzrle cache, no matter what
647 * encoding result it is (normal encoding, overflow or skipping the page),
648 * count the page as encoded. This is used to calculate the encoding rate.
649 *
650 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
651 * 2nd page turns out to be skipped (i.e. no new bytes written to the
652 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
653 * skipped page included. In this way, the encoding rate can tell if the
654 * guest page is good for xbzrle encoding.
655 */
656 xbzrle_counters.pages++;
657 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
658
659 /* save current buffer into memory */
660 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
661
662 /* XBZRLE encoding (if there is no overflow) */
663 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
664 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
665 TARGET_PAGE_SIZE);
666
667 /*
668 * Update the cache contents, so that it corresponds to the data
669 * sent, in all cases except where we skip the page.
670 */
671 if (!rs->last_stage && encoded_len != 0) {
672 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
673 /*
674 * In the case where we couldn't compress, ensure that the caller
675 * sends the data from the cache, since the guest might have
676 * changed the RAM since we copied it.
677 */
678 *current_data = prev_cached_page;
679 }
680
681 if (encoded_len == 0) {
682 trace_save_xbzrle_page_skipping();
683 return 0;
684 } else if (encoded_len == -1) {
685 trace_save_xbzrle_page_overflow();
686 xbzrle_counters.overflow++;
687 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
688 return -1;
689 }
690
691 /* Send XBZRLE based compressed page */
692 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
693 offset | RAM_SAVE_FLAG_XBZRLE);
694 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
695 qemu_put_be16(file, encoded_len);
696 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
697 bytes_xbzrle += encoded_len + 1 + 2;
698 /*
699 * Like compressed_size (please see update_compress_thread_counts),
700 * the xbzrle encoded bytes don't count the 8 byte header with
701 * RAM_SAVE_FLAG_CONTINUE.
702 */
703 xbzrle_counters.bytes += bytes_xbzrle - 8;
704 ram_transferred_add(bytes_xbzrle);
705
706 return 1;
707 }
708
709 /**
710 * pss_find_next_dirty: find the next dirty page of current ramblock
711 *
712 * This function updates pss->page to point to the next dirty page index
713 * within the ramblock to migrate, or the end of ramblock when nothing
714 * found. Note that when pss->host_page_sending==true it means we're
715 * during sending a host page, so we won't look for dirty page that is
716 * outside the host page boundary.
717 *
718 * @pss: the current page search status
719 */
720 static void pss_find_next_dirty(PageSearchStatus *pss)
721 {
722 RAMBlock *rb = pss->block;
723 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
724 unsigned long *bitmap = rb->bmap;
725
726 if (ramblock_is_ignored(rb)) {
727 /* Points directly to the end, so we know no dirty page */
728 pss->page = size;
729 return;
730 }
731
732 /*
733 * If during sending a host page, only look for dirty pages within the
734 * current host page being send.
735 */
736 if (pss->host_page_sending) {
737 assert(pss->host_page_end);
738 size = MIN(size, pss->host_page_end);
739 }
740
741 pss->page = find_next_bit(bitmap, size, pss->page);
742 }
743
744 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
745 unsigned long page)
746 {
747 uint8_t shift;
748 hwaddr size, start;
749
750 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
751 return;
752 }
753
754 shift = rb->clear_bmap_shift;
755 /*
756 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
757 * can make things easier sometimes since then start address
758 * of the small chunk will always be 64 pages aligned so the
759 * bitmap will always be aligned to unsigned long. We should
760 * even be able to remove this restriction but I'm simply
761 * keeping it.
762 */
763 assert(shift >= 6);
764
765 size = 1ULL << (TARGET_PAGE_BITS + shift);
766 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
767 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
768 memory_region_clear_dirty_bitmap(rb->mr, start, size);
769 }
770
771 static void
772 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
773 unsigned long start,
774 unsigned long npages)
775 {
776 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
777 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
778 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
779
780 /*
781 * Clear pages from start to start + npages - 1, so the end boundary is
782 * exclusive.
783 */
784 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
785 migration_clear_memory_region_dirty_bitmap(rb, i);
786 }
787 }
788
789 /*
790 * colo_bitmap_find_diry:find contiguous dirty pages from start
791 *
792 * Returns the page offset within memory region of the start of the contiguout
793 * dirty page
794 *
795 * @rs: current RAM state
796 * @rb: RAMBlock where to search for dirty pages
797 * @start: page where we start the search
798 * @num: the number of contiguous dirty pages
799 */
800 static inline
801 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
802 unsigned long start, unsigned long *num)
803 {
804 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
805 unsigned long *bitmap = rb->bmap;
806 unsigned long first, next;
807
808 *num = 0;
809
810 if (ramblock_is_ignored(rb)) {
811 return size;
812 }
813
814 first = find_next_bit(bitmap, size, start);
815 if (first >= size) {
816 return first;
817 }
818 next = find_next_zero_bit(bitmap, size, first + 1);
819 assert(next >= first);
820 *num = next - first;
821 return first;
822 }
823
824 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
825 RAMBlock *rb,
826 unsigned long page)
827 {
828 bool ret;
829
830 /*
831 * Clear dirty bitmap if needed. This _must_ be called before we
832 * send any of the page in the chunk because we need to make sure
833 * we can capture further page content changes when we sync dirty
834 * log the next time. So as long as we are going to send any of
835 * the page in the chunk we clear the remote dirty bitmap for all.
836 * Clearing it earlier won't be a problem, but too late will.
837 */
838 migration_clear_memory_region_dirty_bitmap(rb, page);
839
840 ret = test_and_clear_bit(page, rb->bmap);
841 if (ret) {
842 rs->migration_dirty_pages--;
843 }
844
845 return ret;
846 }
847
848 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
849 void *opaque)
850 {
851 const hwaddr offset = section->offset_within_region;
852 const hwaddr size = int128_get64(section->size);
853 const unsigned long start = offset >> TARGET_PAGE_BITS;
854 const unsigned long npages = size >> TARGET_PAGE_BITS;
855 RAMBlock *rb = section->mr->ram_block;
856 uint64_t *cleared_bits = opaque;
857
858 /*
859 * We don't grab ram_state->bitmap_mutex because we expect to run
860 * only when starting migration or during postcopy recovery where
861 * we don't have concurrent access.
862 */
863 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
864 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
865 }
866 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
867 bitmap_clear(rb->bmap, start, npages);
868 }
869
870 /*
871 * Exclude all dirty pages from migration that fall into a discarded range as
872 * managed by a RamDiscardManager responsible for the mapped memory region of
873 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
874 *
875 * Discarded pages ("logically unplugged") have undefined content and must
876 * not get migrated, because even reading these pages for migration might
877 * result in undesired behavior.
878 *
879 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
880 *
881 * Note: The result is only stable while migrating (precopy/postcopy).
882 */
883 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
884 {
885 uint64_t cleared_bits = 0;
886
887 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
888 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
889 MemoryRegionSection section = {
890 .mr = rb->mr,
891 .offset_within_region = 0,
892 .size = int128_make64(qemu_ram_get_used_length(rb)),
893 };
894
895 ram_discard_manager_replay_discarded(rdm, &section,
896 dirty_bitmap_clear_section,
897 &cleared_bits);
898 }
899 return cleared_bits;
900 }
901
902 /*
903 * Check if a host-page aligned page falls into a discarded range as managed by
904 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
905 *
906 * Note: The result is only stable while migrating (precopy/postcopy).
907 */
908 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
909 {
910 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
911 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
912 MemoryRegionSection section = {
913 .mr = rb->mr,
914 .offset_within_region = start,
915 .size = int128_make64(qemu_ram_pagesize(rb)),
916 };
917
918 return !ram_discard_manager_is_populated(rdm, &section);
919 }
920 return false;
921 }
922
923 /* Called with RCU critical section */
924 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
925 {
926 uint64_t new_dirty_pages =
927 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
928
929 rs->migration_dirty_pages += new_dirty_pages;
930 rs->num_dirty_pages_period += new_dirty_pages;
931 }
932
933 /**
934 * ram_pagesize_summary: calculate all the pagesizes of a VM
935 *
936 * Returns a summary bitmap of the page sizes of all RAMBlocks
937 *
938 * For VMs with just normal pages this is equivalent to the host page
939 * size. If it's got some huge pages then it's the OR of all the
940 * different page sizes.
941 */
942 uint64_t ram_pagesize_summary(void)
943 {
944 RAMBlock *block;
945 uint64_t summary = 0;
946
947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
948 summary |= block->page_size;
949 }
950
951 return summary;
952 }
953
954 uint64_t ram_get_total_transferred_pages(void)
955 {
956 return stat64_get(&mig_stats.normal_pages) +
957 stat64_get(&mig_stats.zero_pages) +
958 compression_counters.pages + xbzrle_counters.pages;
959 }
960
961 static void migration_update_rates(RAMState *rs, int64_t end_time)
962 {
963 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
964 double compressed_size;
965
966 /* calculate period counters */
967 stat64_set(&mig_stats.dirty_pages_rate,
968 rs->num_dirty_pages_period * 1000 /
969 (end_time - rs->time_last_bitmap_sync));
970
971 if (!page_count) {
972 return;
973 }
974
975 if (migrate_xbzrle()) {
976 double encoded_size, unencoded_size;
977
978 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
979 rs->xbzrle_cache_miss_prev) / page_count;
980 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
981 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
982 TARGET_PAGE_SIZE;
983 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
984 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
985 xbzrle_counters.encoding_rate = 0;
986 } else {
987 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
988 }
989 rs->xbzrle_pages_prev = xbzrle_counters.pages;
990 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
991 }
992
993 if (migrate_compress()) {
994 compression_counters.busy_rate = (double)(compression_counters.busy -
995 rs->compress_thread_busy_prev) / page_count;
996 rs->compress_thread_busy_prev = compression_counters.busy;
997
998 compressed_size = compression_counters.compressed_size -
999 rs->compressed_size_prev;
1000 if (compressed_size) {
1001 double uncompressed_size = (compression_counters.pages -
1002 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1003
1004 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1005 compression_counters.compression_rate =
1006 uncompressed_size / compressed_size;
1007
1008 rs->compress_pages_prev = compression_counters.pages;
1009 rs->compressed_size_prev = compression_counters.compressed_size;
1010 }
1011 }
1012 }
1013
1014 static void migration_trigger_throttle(RAMState *rs)
1015 {
1016 uint64_t threshold = migrate_throttle_trigger_threshold();
1017 uint64_t bytes_xfer_period =
1018 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
1019 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1020 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1021
1022 /* During block migration the auto-converge logic incorrectly detects
1023 * that ram migration makes no progress. Avoid this by disabling the
1024 * throttling logic during the bulk phase of block migration. */
1025 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1026 /* The following detection logic can be refined later. For now:
1027 Check to see if the ratio between dirtied bytes and the approx.
1028 amount of bytes that just got transferred since the last time
1029 we were in this routine reaches the threshold. If that happens
1030 twice, start or increase throttling. */
1031
1032 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1033 (++rs->dirty_rate_high_cnt >= 2)) {
1034 trace_migration_throttle();
1035 rs->dirty_rate_high_cnt = 0;
1036 mig_throttle_guest_down(bytes_dirty_period,
1037 bytes_dirty_threshold);
1038 }
1039 }
1040 }
1041
1042 static void migration_bitmap_sync(RAMState *rs)
1043 {
1044 RAMBlock *block;
1045 int64_t end_time;
1046
1047 stat64_add(&mig_stats.dirty_sync_count, 1);
1048
1049 if (!rs->time_last_bitmap_sync) {
1050 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1051 }
1052
1053 trace_migration_bitmap_sync_start();
1054 memory_global_dirty_log_sync();
1055
1056 qemu_mutex_lock(&rs->bitmap_mutex);
1057 WITH_RCU_READ_LOCK_GUARD() {
1058 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1059 ramblock_sync_dirty_bitmap(rs, block);
1060 }
1061 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1062 }
1063 qemu_mutex_unlock(&rs->bitmap_mutex);
1064
1065 memory_global_after_dirty_log_sync();
1066 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1067
1068 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1069
1070 /* more than 1 second = 1000 millisecons */
1071 if (end_time > rs->time_last_bitmap_sync + 1000) {
1072 migration_trigger_throttle(rs);
1073
1074 migration_update_rates(rs, end_time);
1075
1076 rs->target_page_count_prev = rs->target_page_count;
1077
1078 /* reset period counters */
1079 rs->time_last_bitmap_sync = end_time;
1080 rs->num_dirty_pages_period = 0;
1081 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1082 }
1083 if (migrate_events()) {
1084 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1085 qapi_event_send_migration_pass(generation);
1086 }
1087 }
1088
1089 static void migration_bitmap_sync_precopy(RAMState *rs)
1090 {
1091 Error *local_err = NULL;
1092
1093 /*
1094 * The current notifier usage is just an optimization to migration, so we
1095 * don't stop the normal migration process in the error case.
1096 */
1097 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1098 error_report_err(local_err);
1099 local_err = NULL;
1100 }
1101
1102 migration_bitmap_sync(rs);
1103
1104 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1105 error_report_err(local_err);
1106 }
1107 }
1108
1109 void ram_release_page(const char *rbname, uint64_t offset)
1110 {
1111 if (!migrate_release_ram() || !migration_in_postcopy()) {
1112 return;
1113 }
1114
1115 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1116 }
1117
1118 /**
1119 * save_zero_page_to_file: send the zero page to the file
1120 *
1121 * Returns the size of data written to the file, 0 means the page is not
1122 * a zero page
1123 *
1124 * @pss: current PSS channel
1125 * @block: block that contains the page we want to send
1126 * @offset: offset inside the block for the page
1127 */
1128 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1129 RAMBlock *block, ram_addr_t offset)
1130 {
1131 uint8_t *p = block->host + offset;
1132 int len = 0;
1133
1134 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1135 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1136 qemu_put_byte(file, 0);
1137 len += 1;
1138 ram_release_page(block->idstr, offset);
1139 }
1140 return len;
1141 }
1142
1143 /**
1144 * save_zero_page: send the zero page to the stream
1145 *
1146 * Returns the number of pages written.
1147 *
1148 * @pss: current PSS channel
1149 * @block: block that contains the page we want to send
1150 * @offset: offset inside the block for the page
1151 */
1152 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1153 ram_addr_t offset)
1154 {
1155 int len = save_zero_page_to_file(pss, f, block, offset);
1156
1157 if (len) {
1158 stat64_add(&mig_stats.zero_pages, 1);
1159 ram_transferred_add(len);
1160 return 1;
1161 }
1162 return -1;
1163 }
1164
1165 /*
1166 * @pages: the number of pages written by the control path,
1167 * < 0 - error
1168 * > 0 - number of pages written
1169 *
1170 * Return true if the pages has been saved, otherwise false is returned.
1171 */
1172 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1173 ram_addr_t offset, int *pages)
1174 {
1175 uint64_t bytes_xmit = 0;
1176 int ret;
1177
1178 *pages = -1;
1179 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1180 TARGET_PAGE_SIZE, &bytes_xmit);
1181 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1182 return false;
1183 }
1184
1185 if (bytes_xmit) {
1186 ram_transferred_add(bytes_xmit);
1187 *pages = 1;
1188 }
1189
1190 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1191 return true;
1192 }
1193
1194 if (bytes_xmit > 0) {
1195 stat64_add(&mig_stats.normal_pages, 1);
1196 } else if (bytes_xmit == 0) {
1197 stat64_add(&mig_stats.zero_pages, 1);
1198 }
1199
1200 return true;
1201 }
1202
1203 /*
1204 * directly send the page to the stream
1205 *
1206 * Returns the number of pages written.
1207 *
1208 * @pss: current PSS channel
1209 * @block: block that contains the page we want to send
1210 * @offset: offset inside the block for the page
1211 * @buf: the page to be sent
1212 * @async: send to page asyncly
1213 */
1214 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1215 ram_addr_t offset, uint8_t *buf, bool async)
1216 {
1217 QEMUFile *file = pss->pss_channel;
1218
1219 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1220 offset | RAM_SAVE_FLAG_PAGE));
1221 if (async) {
1222 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1223 migrate_release_ram() &&
1224 migration_in_postcopy());
1225 } else {
1226 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1227 }
1228 ram_transferred_add(TARGET_PAGE_SIZE);
1229 stat64_add(&mig_stats.normal_pages, 1);
1230 return 1;
1231 }
1232
1233 /**
1234 * ram_save_page: send the given page to the stream
1235 *
1236 * Returns the number of pages written.
1237 * < 0 - error
1238 * >=0 - Number of pages written - this might legally be 0
1239 * if xbzrle noticed the page was the same.
1240 *
1241 * @rs: current RAM state
1242 * @block: block that contains the page we want to send
1243 * @offset: offset inside the block for the page
1244 */
1245 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1246 {
1247 int pages = -1;
1248 uint8_t *p;
1249 bool send_async = true;
1250 RAMBlock *block = pss->block;
1251 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1252 ram_addr_t current_addr = block->offset + offset;
1253
1254 p = block->host + offset;
1255 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1256
1257 XBZRLE_cache_lock();
1258 if (rs->xbzrle_started && !migration_in_postcopy()) {
1259 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1260 block, offset);
1261 if (!rs->last_stage) {
1262 /* Can't send this cached data async, since the cache page
1263 * might get updated before it gets to the wire
1264 */
1265 send_async = false;
1266 }
1267 }
1268
1269 /* XBZRLE overflow or normal page */
1270 if (pages == -1) {
1271 pages = save_normal_page(pss, block, offset, p, send_async);
1272 }
1273
1274 XBZRLE_cache_unlock();
1275
1276 return pages;
1277 }
1278
1279 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1280 ram_addr_t offset)
1281 {
1282 if (multifd_queue_page(file, block, offset) < 0) {
1283 return -1;
1284 }
1285 stat64_add(&mig_stats.normal_pages, 1);
1286
1287 return 1;
1288 }
1289
1290 static void
1291 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1292 {
1293 ram_transferred_add(bytes_xmit);
1294
1295 if (param->result == RES_ZEROPAGE) {
1296 stat64_add(&mig_stats.zero_pages, 1);
1297 return;
1298 }
1299
1300 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1301 compression_counters.compressed_size += bytes_xmit - 8;
1302 compression_counters.pages++;
1303 }
1304
1305 static bool save_page_use_compression(RAMState *rs);
1306
1307 static int send_queued_data(CompressParam *param)
1308 {
1309 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1310 MigrationState *ms = migrate_get_current();
1311 QEMUFile *file = ms->to_dst_file;
1312 int len = 0;
1313
1314 RAMBlock *block = param->block;
1315 ram_addr_t offset = param->offset;
1316
1317 if (param->result == RES_NONE) {
1318 return 0;
1319 }
1320
1321 assert(block == pss->last_sent_block);
1322
1323 if (param->result == RES_ZEROPAGE) {
1324 assert(qemu_file_buffer_empty(param->file));
1325 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1326 qemu_put_byte(file, 0);
1327 len += 1;
1328 ram_release_page(block->idstr, offset);
1329 } else if (param->result == RES_COMPRESS) {
1330 assert(!qemu_file_buffer_empty(param->file));
1331 len += save_page_header(pss, file, block,
1332 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1333 len += qemu_put_qemu_file(file, param->file);
1334 } else {
1335 abort();
1336 }
1337
1338 update_compress_thread_counts(param, len);
1339
1340 return len;
1341 }
1342
1343 static void ram_flush_compressed_data(RAMState *rs)
1344 {
1345 if (!save_page_use_compression(rs)) {
1346 return;
1347 }
1348
1349 flush_compressed_data(send_queued_data);
1350 }
1351
1352 #define PAGE_ALL_CLEAN 0
1353 #define PAGE_TRY_AGAIN 1
1354 #define PAGE_DIRTY_FOUND 2
1355 /**
1356 * find_dirty_block: find the next dirty page and update any state
1357 * associated with the search process.
1358 *
1359 * Returns:
1360 * <0: An error happened
1361 * PAGE_ALL_CLEAN: no dirty page found, give up
1362 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1363 * PAGE_DIRTY_FOUND: dirty page found
1364 *
1365 * @rs: current RAM state
1366 * @pss: data about the state of the current dirty page scan
1367 * @again: set to false if the search has scanned the whole of RAM
1368 */
1369 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1370 {
1371 /* Update pss->page for the next dirty bit in ramblock */
1372 pss_find_next_dirty(pss);
1373
1374 if (pss->complete_round && pss->block == rs->last_seen_block &&
1375 pss->page >= rs->last_page) {
1376 /*
1377 * We've been once around the RAM and haven't found anything.
1378 * Give up.
1379 */
1380 return PAGE_ALL_CLEAN;
1381 }
1382 if (!offset_in_ramblock(pss->block,
1383 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1384 /* Didn't find anything in this RAM Block */
1385 pss->page = 0;
1386 pss->block = QLIST_NEXT_RCU(pss->block, next);
1387 if (!pss->block) {
1388 if (!migrate_multifd_flush_after_each_section()) {
1389 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1390 int ret = multifd_send_sync_main(f);
1391 if (ret < 0) {
1392 return ret;
1393 }
1394 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1395 qemu_fflush(f);
1396 }
1397 /*
1398 * If memory migration starts over, we will meet a dirtied page
1399 * which may still exists in compression threads's ring, so we
1400 * should flush the compressed data to make sure the new page
1401 * is not overwritten by the old one in the destination.
1402 *
1403 * Also If xbzrle is on, stop using the data compression at this
1404 * point. In theory, xbzrle can do better than compression.
1405 */
1406 ram_flush_compressed_data(rs);
1407
1408 /* Hit the end of the list */
1409 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1410 /* Flag that we've looped */
1411 pss->complete_round = true;
1412 /* After the first round, enable XBZRLE. */
1413 if (migrate_xbzrle()) {
1414 rs->xbzrle_started = true;
1415 }
1416 }
1417 /* Didn't find anything this time, but try again on the new block */
1418 return PAGE_TRY_AGAIN;
1419 } else {
1420 /* We've found something */
1421 return PAGE_DIRTY_FOUND;
1422 }
1423 }
1424
1425 /**
1426 * unqueue_page: gets a page of the queue
1427 *
1428 * Helper for 'get_queued_page' - gets a page off the queue
1429 *
1430 * Returns the block of the page (or NULL if none available)
1431 *
1432 * @rs: current RAM state
1433 * @offset: used to return the offset within the RAMBlock
1434 */
1435 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1436 {
1437 struct RAMSrcPageRequest *entry;
1438 RAMBlock *block = NULL;
1439
1440 if (!postcopy_has_request(rs)) {
1441 return NULL;
1442 }
1443
1444 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1445
1446 /*
1447 * This should _never_ change even after we take the lock, because no one
1448 * should be taking anything off the request list other than us.
1449 */
1450 assert(postcopy_has_request(rs));
1451
1452 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1453 block = entry->rb;
1454 *offset = entry->offset;
1455
1456 if (entry->len > TARGET_PAGE_SIZE) {
1457 entry->len -= TARGET_PAGE_SIZE;
1458 entry->offset += TARGET_PAGE_SIZE;
1459 } else {
1460 memory_region_unref(block->mr);
1461 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1462 g_free(entry);
1463 migration_consume_urgent_request();
1464 }
1465
1466 return block;
1467 }
1468
1469 #if defined(__linux__)
1470 /**
1471 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1472 * is found, return RAM block pointer and page offset
1473 *
1474 * Returns pointer to the RAMBlock containing faulting page,
1475 * NULL if no write faults are pending
1476 *
1477 * @rs: current RAM state
1478 * @offset: page offset from the beginning of the block
1479 */
1480 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1481 {
1482 struct uffd_msg uffd_msg;
1483 void *page_address;
1484 RAMBlock *block;
1485 int res;
1486
1487 if (!migrate_background_snapshot()) {
1488 return NULL;
1489 }
1490
1491 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1492 if (res <= 0) {
1493 return NULL;
1494 }
1495
1496 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1497 block = qemu_ram_block_from_host(page_address, false, offset);
1498 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1499 return block;
1500 }
1501
1502 /**
1503 * ram_save_release_protection: release UFFD write protection after
1504 * a range of pages has been saved
1505 *
1506 * @rs: current RAM state
1507 * @pss: page-search-status structure
1508 * @start_page: index of the first page in the range relative to pss->block
1509 *
1510 * Returns 0 on success, negative value in case of an error
1511 */
1512 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1513 unsigned long start_page)
1514 {
1515 int res = 0;
1516
1517 /* Check if page is from UFFD-managed region. */
1518 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1519 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1520 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1521
1522 /* Flush async buffers before un-protect. */
1523 qemu_fflush(pss->pss_channel);
1524 /* Un-protect memory range. */
1525 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1526 false, false);
1527 }
1528
1529 return res;
1530 }
1531
1532 /* ram_write_tracking_available: check if kernel supports required UFFD features
1533 *
1534 * Returns true if supports, false otherwise
1535 */
1536 bool ram_write_tracking_available(void)
1537 {
1538 uint64_t uffd_features;
1539 int res;
1540
1541 res = uffd_query_features(&uffd_features);
1542 return (res == 0 &&
1543 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1544 }
1545
1546 /* ram_write_tracking_compatible: check if guest configuration is
1547 * compatible with 'write-tracking'
1548 *
1549 * Returns true if compatible, false otherwise
1550 */
1551 bool ram_write_tracking_compatible(void)
1552 {
1553 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1554 int uffd_fd;
1555 RAMBlock *block;
1556 bool ret = false;
1557
1558 /* Open UFFD file descriptor */
1559 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1560 if (uffd_fd < 0) {
1561 return false;
1562 }
1563
1564 RCU_READ_LOCK_GUARD();
1565
1566 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1567 uint64_t uffd_ioctls;
1568
1569 /* Nothing to do with read-only and MMIO-writable regions */
1570 if (block->mr->readonly || block->mr->rom_device) {
1571 continue;
1572 }
1573 /* Try to register block memory via UFFD-IO to track writes */
1574 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1575 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1576 goto out;
1577 }
1578 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1579 goto out;
1580 }
1581 }
1582 ret = true;
1583
1584 out:
1585 uffd_close_fd(uffd_fd);
1586 return ret;
1587 }
1588
1589 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1590 ram_addr_t size)
1591 {
1592 const ram_addr_t end = offset + size;
1593
1594 /*
1595 * We read one byte of each page; this will preallocate page tables if
1596 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1597 * where no page was populated yet. This might require adaption when
1598 * supporting other mappings, like shmem.
1599 */
1600 for (; offset < end; offset += block->page_size) {
1601 char tmp = *((char *)block->host + offset);
1602
1603 /* Don't optimize the read out */
1604 asm volatile("" : "+r" (tmp));
1605 }
1606 }
1607
1608 static inline int populate_read_section(MemoryRegionSection *section,
1609 void *opaque)
1610 {
1611 const hwaddr size = int128_get64(section->size);
1612 hwaddr offset = section->offset_within_region;
1613 RAMBlock *block = section->mr->ram_block;
1614
1615 populate_read_range(block, offset, size);
1616 return 0;
1617 }
1618
1619 /*
1620 * ram_block_populate_read: preallocate page tables and populate pages in the
1621 * RAM block by reading a byte of each page.
1622 *
1623 * Since it's solely used for userfault_fd WP feature, here we just
1624 * hardcode page size to qemu_real_host_page_size.
1625 *
1626 * @block: RAM block to populate
1627 */
1628 static void ram_block_populate_read(RAMBlock *rb)
1629 {
1630 /*
1631 * Skip populating all pages that fall into a discarded range as managed by
1632 * a RamDiscardManager responsible for the mapped memory region of the
1633 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1634 * must not get populated automatically. We don't have to track
1635 * modifications via userfaultfd WP reliably, because these pages will
1636 * not be part of the migration stream either way -- see
1637 * ramblock_dirty_bitmap_exclude_discarded_pages().
1638 *
1639 * Note: The result is only stable while migrating (precopy/postcopy).
1640 */
1641 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1642 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1643 MemoryRegionSection section = {
1644 .mr = rb->mr,
1645 .offset_within_region = 0,
1646 .size = rb->mr->size,
1647 };
1648
1649 ram_discard_manager_replay_populated(rdm, &section,
1650 populate_read_section, NULL);
1651 } else {
1652 populate_read_range(rb, 0, rb->used_length);
1653 }
1654 }
1655
1656 /*
1657 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1658 */
1659 void ram_write_tracking_prepare(void)
1660 {
1661 RAMBlock *block;
1662
1663 RCU_READ_LOCK_GUARD();
1664
1665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1666 /* Nothing to do with read-only and MMIO-writable regions */
1667 if (block->mr->readonly || block->mr->rom_device) {
1668 continue;
1669 }
1670
1671 /*
1672 * Populate pages of the RAM block before enabling userfault_fd
1673 * write protection.
1674 *
1675 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1676 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1677 * pages with pte_none() entries in page table.
1678 */
1679 ram_block_populate_read(block);
1680 }
1681 }
1682
1683 static inline int uffd_protect_section(MemoryRegionSection *section,
1684 void *opaque)
1685 {
1686 const hwaddr size = int128_get64(section->size);
1687 const hwaddr offset = section->offset_within_region;
1688 RAMBlock *rb = section->mr->ram_block;
1689 int uffd_fd = (uintptr_t)opaque;
1690
1691 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1692 false);
1693 }
1694
1695 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1696 {
1697 assert(rb->flags & RAM_UF_WRITEPROTECT);
1698
1699 /* See ram_block_populate_read() */
1700 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1701 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1702 MemoryRegionSection section = {
1703 .mr = rb->mr,
1704 .offset_within_region = 0,
1705 .size = rb->mr->size,
1706 };
1707
1708 return ram_discard_manager_replay_populated(rdm, &section,
1709 uffd_protect_section,
1710 (void *)(uintptr_t)uffd_fd);
1711 }
1712 return uffd_change_protection(uffd_fd, rb->host,
1713 rb->used_length, true, false);
1714 }
1715
1716 /*
1717 * ram_write_tracking_start: start UFFD-WP memory tracking
1718 *
1719 * Returns 0 for success or negative value in case of error
1720 */
1721 int ram_write_tracking_start(void)
1722 {
1723 int uffd_fd;
1724 RAMState *rs = ram_state;
1725 RAMBlock *block;
1726
1727 /* Open UFFD file descriptor */
1728 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1729 if (uffd_fd < 0) {
1730 return uffd_fd;
1731 }
1732 rs->uffdio_fd = uffd_fd;
1733
1734 RCU_READ_LOCK_GUARD();
1735
1736 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1737 /* Nothing to do with read-only and MMIO-writable regions */
1738 if (block->mr->readonly || block->mr->rom_device) {
1739 continue;
1740 }
1741
1742 /* Register block memory with UFFD to track writes */
1743 if (uffd_register_memory(rs->uffdio_fd, block->host,
1744 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1745 goto fail;
1746 }
1747 block->flags |= RAM_UF_WRITEPROTECT;
1748 memory_region_ref(block->mr);
1749
1750 /* Apply UFFD write protection to the block memory range */
1751 if (ram_block_uffd_protect(block, uffd_fd)) {
1752 goto fail;
1753 }
1754
1755 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1756 block->host, block->max_length);
1757 }
1758
1759 return 0;
1760
1761 fail:
1762 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1763
1764 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1765 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1766 continue;
1767 }
1768 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1769 /* Cleanup flags and remove reference */
1770 block->flags &= ~RAM_UF_WRITEPROTECT;
1771 memory_region_unref(block->mr);
1772 }
1773
1774 uffd_close_fd(uffd_fd);
1775 rs->uffdio_fd = -1;
1776 return -1;
1777 }
1778
1779 /**
1780 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1781 */
1782 void ram_write_tracking_stop(void)
1783 {
1784 RAMState *rs = ram_state;
1785 RAMBlock *block;
1786
1787 RCU_READ_LOCK_GUARD();
1788
1789 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1790 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1791 continue;
1792 }
1793 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1794
1795 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1796 block->host, block->max_length);
1797
1798 /* Cleanup flags and remove reference */
1799 block->flags &= ~RAM_UF_WRITEPROTECT;
1800 memory_region_unref(block->mr);
1801 }
1802
1803 /* Finally close UFFD file descriptor */
1804 uffd_close_fd(rs->uffdio_fd);
1805 rs->uffdio_fd = -1;
1806 }
1807
1808 #else
1809 /* No target OS support, stubs just fail or ignore */
1810
1811 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1812 {
1813 (void) rs;
1814 (void) offset;
1815
1816 return NULL;
1817 }
1818
1819 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1820 unsigned long start_page)
1821 {
1822 (void) rs;
1823 (void) pss;
1824 (void) start_page;
1825
1826 return 0;
1827 }
1828
1829 bool ram_write_tracking_available(void)
1830 {
1831 return false;
1832 }
1833
1834 bool ram_write_tracking_compatible(void)
1835 {
1836 assert(0);
1837 return false;
1838 }
1839
1840 int ram_write_tracking_start(void)
1841 {
1842 assert(0);
1843 return -1;
1844 }
1845
1846 void ram_write_tracking_stop(void)
1847 {
1848 assert(0);
1849 }
1850 #endif /* defined(__linux__) */
1851
1852 /**
1853 * get_queued_page: unqueue a page from the postcopy requests
1854 *
1855 * Skips pages that are already sent (!dirty)
1856 *
1857 * Returns true if a queued page is found
1858 *
1859 * @rs: current RAM state
1860 * @pss: data about the state of the current dirty page scan
1861 */
1862 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1863 {
1864 RAMBlock *block;
1865 ram_addr_t offset;
1866 bool dirty;
1867
1868 do {
1869 block = unqueue_page(rs, &offset);
1870 /*
1871 * We're sending this page, and since it's postcopy nothing else
1872 * will dirty it, and we must make sure it doesn't get sent again
1873 * even if this queue request was received after the background
1874 * search already sent it.
1875 */
1876 if (block) {
1877 unsigned long page;
1878
1879 page = offset >> TARGET_PAGE_BITS;
1880 dirty = test_bit(page, block->bmap);
1881 if (!dirty) {
1882 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1883 page);
1884 } else {
1885 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1886 }
1887 }
1888
1889 } while (block && !dirty);
1890
1891 if (!block) {
1892 /*
1893 * Poll write faults too if background snapshot is enabled; that's
1894 * when we have vcpus got blocked by the write protected pages.
1895 */
1896 block = poll_fault_page(rs, &offset);
1897 }
1898
1899 if (block) {
1900 /*
1901 * We want the background search to continue from the queued page
1902 * since the guest is likely to want other pages near to the page
1903 * it just requested.
1904 */
1905 pss->block = block;
1906 pss->page = offset >> TARGET_PAGE_BITS;
1907
1908 /*
1909 * This unqueued page would break the "one round" check, even is
1910 * really rare.
1911 */
1912 pss->complete_round = false;
1913 }
1914
1915 return !!block;
1916 }
1917
1918 /**
1919 * migration_page_queue_free: drop any remaining pages in the ram
1920 * request queue
1921 *
1922 * It should be empty at the end anyway, but in error cases there may
1923 * be some left. in case that there is any page left, we drop it.
1924 *
1925 */
1926 static void migration_page_queue_free(RAMState *rs)
1927 {
1928 struct RAMSrcPageRequest *mspr, *next_mspr;
1929 /* This queue generally should be empty - but in the case of a failed
1930 * migration might have some droppings in.
1931 */
1932 RCU_READ_LOCK_GUARD();
1933 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1934 memory_region_unref(mspr->rb->mr);
1935 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1936 g_free(mspr);
1937 }
1938 }
1939
1940 /**
1941 * ram_save_queue_pages: queue the page for transmission
1942 *
1943 * A request from postcopy destination for example.
1944 *
1945 * Returns zero on success or negative on error
1946 *
1947 * @rbname: Name of the RAMBLock of the request. NULL means the
1948 * same that last one.
1949 * @start: starting address from the start of the RAMBlock
1950 * @len: length (in bytes) to send
1951 */
1952 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1953 {
1954 RAMBlock *ramblock;
1955 RAMState *rs = ram_state;
1956
1957 stat64_add(&mig_stats.postcopy_requests, 1);
1958 RCU_READ_LOCK_GUARD();
1959
1960 if (!rbname) {
1961 /* Reuse last RAMBlock */
1962 ramblock = rs->last_req_rb;
1963
1964 if (!ramblock) {
1965 /*
1966 * Shouldn't happen, we can't reuse the last RAMBlock if
1967 * it's the 1st request.
1968 */
1969 error_report("ram_save_queue_pages no previous block");
1970 return -1;
1971 }
1972 } else {
1973 ramblock = qemu_ram_block_by_name(rbname);
1974
1975 if (!ramblock) {
1976 /* We shouldn't be asked for a non-existent RAMBlock */
1977 error_report("ram_save_queue_pages no block '%s'", rbname);
1978 return -1;
1979 }
1980 rs->last_req_rb = ramblock;
1981 }
1982 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1983 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1984 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1985 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1986 __func__, start, len, ramblock->used_length);
1987 return -1;
1988 }
1989
1990 /*
1991 * When with postcopy preempt, we send back the page directly in the
1992 * rp-return thread.
1993 */
1994 if (postcopy_preempt_active()) {
1995 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1996 size_t page_size = qemu_ram_pagesize(ramblock);
1997 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1998 int ret = 0;
1999
2000 qemu_mutex_lock(&rs->bitmap_mutex);
2001
2002 pss_init(pss, ramblock, page_start);
2003 /*
2004 * Always use the preempt channel, and make sure it's there. It's
2005 * safe to access without lock, because when rp-thread is running
2006 * we should be the only one who operates on the qemufile
2007 */
2008 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2009 assert(pss->pss_channel);
2010
2011 /*
2012 * It must be either one or multiple of host page size. Just
2013 * assert; if something wrong we're mostly split brain anyway.
2014 */
2015 assert(len % page_size == 0);
2016 while (len) {
2017 if (ram_save_host_page_urgent(pss)) {
2018 error_report("%s: ram_save_host_page_urgent() failed: "
2019 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2020 __func__, ramblock->idstr, start);
2021 ret = -1;
2022 break;
2023 }
2024 /*
2025 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2026 * will automatically be moved and point to the next host page
2027 * we're going to send, so no need to update here.
2028 *
2029 * Normally QEMU never sends >1 host page in requests, so
2030 * logically we don't even need that as the loop should only
2031 * run once, but just to be consistent.
2032 */
2033 len -= page_size;
2034 };
2035 qemu_mutex_unlock(&rs->bitmap_mutex);
2036
2037 return ret;
2038 }
2039
2040 struct RAMSrcPageRequest *new_entry =
2041 g_new0(struct RAMSrcPageRequest, 1);
2042 new_entry->rb = ramblock;
2043 new_entry->offset = start;
2044 new_entry->len = len;
2045
2046 memory_region_ref(ramblock->mr);
2047 qemu_mutex_lock(&rs->src_page_req_mutex);
2048 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2049 migration_make_urgent_request();
2050 qemu_mutex_unlock(&rs->src_page_req_mutex);
2051
2052 return 0;
2053 }
2054
2055 static bool save_page_use_compression(RAMState *rs)
2056 {
2057 if (!migrate_compress()) {
2058 return false;
2059 }
2060
2061 /*
2062 * If xbzrle is enabled (e.g., after first round of migration), stop
2063 * using the data compression. In theory, xbzrle can do better than
2064 * compression.
2065 */
2066 if (rs->xbzrle_started) {
2067 return false;
2068 }
2069
2070 return true;
2071 }
2072
2073 /*
2074 * try to compress the page before posting it out, return true if the page
2075 * has been properly handled by compression, otherwise needs other
2076 * paths to handle it
2077 */
2078 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2079 RAMBlock *block, ram_addr_t offset)
2080 {
2081 if (!save_page_use_compression(rs)) {
2082 return false;
2083 }
2084
2085 /*
2086 * When starting the process of a new block, the first page of
2087 * the block should be sent out before other pages in the same
2088 * block, and all the pages in last block should have been sent
2089 * out, keeping this order is important, because the 'cont' flag
2090 * is used to avoid resending the block name.
2091 *
2092 * We post the fist page as normal page as compression will take
2093 * much CPU resource.
2094 */
2095 if (block != pss->last_sent_block) {
2096 ram_flush_compressed_data(rs);
2097 return false;
2098 }
2099
2100 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2101 return true;
2102 }
2103
2104 compression_counters.busy++;
2105 return false;
2106 }
2107
2108 /**
2109 * ram_save_target_page_legacy: save one target page
2110 *
2111 * Returns the number of pages written
2112 *
2113 * @rs: current RAM state
2114 * @pss: data about the page we want to send
2115 */
2116 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2117 {
2118 RAMBlock *block = pss->block;
2119 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2120 int res;
2121
2122 if (control_save_page(pss, block, offset, &res)) {
2123 return res;
2124 }
2125
2126 if (save_compress_page(rs, pss, block, offset)) {
2127 return 1;
2128 }
2129
2130 res = save_zero_page(pss, pss->pss_channel, block, offset);
2131 if (res > 0) {
2132 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2133 * page would be stale
2134 */
2135 if (rs->xbzrle_started) {
2136 XBZRLE_cache_lock();
2137 xbzrle_cache_zero_page(rs, block->offset + offset);
2138 XBZRLE_cache_unlock();
2139 }
2140 return res;
2141 }
2142
2143 /*
2144 * Do not use multifd in postcopy as one whole host page should be
2145 * placed. Meanwhile postcopy requires atomic update of pages, so even
2146 * if host page size == guest page size the dest guest during run may
2147 * still see partially copied pages which is data corruption.
2148 */
2149 if (migrate_multifd() && !migration_in_postcopy()) {
2150 return ram_save_multifd_page(pss->pss_channel, block, offset);
2151 }
2152
2153 return ram_save_page(rs, pss);
2154 }
2155
2156 /* Should be called before sending a host page */
2157 static void pss_host_page_prepare(PageSearchStatus *pss)
2158 {
2159 /* How many guest pages are there in one host page? */
2160 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2161
2162 pss->host_page_sending = true;
2163 if (guest_pfns <= 1) {
2164 /*
2165 * This covers both when guest psize == host psize, or when guest
2166 * has larger psize than the host (guest_pfns==0).
2167 *
2168 * For the latter, we always send one whole guest page per
2169 * iteration of the host page (example: an Alpha VM on x86 host
2170 * will have guest psize 8K while host psize 4K).
2171 */
2172 pss->host_page_start = pss->page;
2173 pss->host_page_end = pss->page + 1;
2174 } else {
2175 /*
2176 * The host page spans over multiple guest pages, we send them
2177 * within the same host page iteration.
2178 */
2179 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2180 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2181 }
2182 }
2183
2184 /*
2185 * Whether the page pointed by PSS is within the host page being sent.
2186 * Must be called after a previous pss_host_page_prepare().
2187 */
2188 static bool pss_within_range(PageSearchStatus *pss)
2189 {
2190 ram_addr_t ram_addr;
2191
2192 assert(pss->host_page_sending);
2193
2194 /* Over host-page boundary? */
2195 if (pss->page >= pss->host_page_end) {
2196 return false;
2197 }
2198
2199 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2200
2201 return offset_in_ramblock(pss->block, ram_addr);
2202 }
2203
2204 static void pss_host_page_finish(PageSearchStatus *pss)
2205 {
2206 pss->host_page_sending = false;
2207 /* This is not needed, but just to reset it */
2208 pss->host_page_start = pss->host_page_end = 0;
2209 }
2210
2211 /*
2212 * Send an urgent host page specified by `pss'. Need to be called with
2213 * bitmap_mutex held.
2214 *
2215 * Returns 0 if save host page succeeded, false otherwise.
2216 */
2217 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2218 {
2219 bool page_dirty, sent = false;
2220 RAMState *rs = ram_state;
2221 int ret = 0;
2222
2223 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2224 pss_host_page_prepare(pss);
2225
2226 /*
2227 * If precopy is sending the same page, let it be done in precopy, or
2228 * we could send the same page in two channels and none of them will
2229 * receive the whole page.
2230 */
2231 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2232 trace_postcopy_preempt_hit(pss->block->idstr,
2233 pss->page << TARGET_PAGE_BITS);
2234 return 0;
2235 }
2236
2237 do {
2238 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2239
2240 if (page_dirty) {
2241 /* Be strict to return code; it must be 1, or what else? */
2242 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2243 error_report_once("%s: ram_save_target_page failed", __func__);
2244 ret = -1;
2245 goto out;
2246 }
2247 sent = true;
2248 }
2249 pss_find_next_dirty(pss);
2250 } while (pss_within_range(pss));
2251 out:
2252 pss_host_page_finish(pss);
2253 /* For urgent requests, flush immediately if sent */
2254 if (sent) {
2255 qemu_fflush(pss->pss_channel);
2256 }
2257 return ret;
2258 }
2259
2260 /**
2261 * ram_save_host_page: save a whole host page
2262 *
2263 * Starting at *offset send pages up to the end of the current host
2264 * page. It's valid for the initial offset to point into the middle of
2265 * a host page in which case the remainder of the hostpage is sent.
2266 * Only dirty target pages are sent. Note that the host page size may
2267 * be a huge page for this block.
2268 *
2269 * The saving stops at the boundary of the used_length of the block
2270 * if the RAMBlock isn't a multiple of the host page size.
2271 *
2272 * The caller must be with ram_state.bitmap_mutex held to call this
2273 * function. Note that this function can temporarily release the lock, but
2274 * when the function is returned it'll make sure the lock is still held.
2275 *
2276 * Returns the number of pages written or negative on error
2277 *
2278 * @rs: current RAM state
2279 * @pss: data about the page we want to send
2280 */
2281 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2282 {
2283 bool page_dirty, preempt_active = postcopy_preempt_active();
2284 int tmppages, pages = 0;
2285 size_t pagesize_bits =
2286 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2287 unsigned long start_page = pss->page;
2288 int res;
2289
2290 if (ramblock_is_ignored(pss->block)) {
2291 error_report("block %s should not be migrated !", pss->block->idstr);
2292 return 0;
2293 }
2294
2295 /* Update host page boundary information */
2296 pss_host_page_prepare(pss);
2297
2298 do {
2299 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2300
2301 /* Check the pages is dirty and if it is send it */
2302 if (page_dirty) {
2303 /*
2304 * Properly yield the lock only in postcopy preempt mode
2305 * because both migration thread and rp-return thread can
2306 * operate on the bitmaps.
2307 */
2308 if (preempt_active) {
2309 qemu_mutex_unlock(&rs->bitmap_mutex);
2310 }
2311 tmppages = migration_ops->ram_save_target_page(rs, pss);
2312 if (tmppages >= 0) {
2313 pages += tmppages;
2314 /*
2315 * Allow rate limiting to happen in the middle of huge pages if
2316 * something is sent in the current iteration.
2317 */
2318 if (pagesize_bits > 1 && tmppages > 0) {
2319 migration_rate_limit();
2320 }
2321 }
2322 if (preempt_active) {
2323 qemu_mutex_lock(&rs->bitmap_mutex);
2324 }
2325 } else {
2326 tmppages = 0;
2327 }
2328
2329 if (tmppages < 0) {
2330 pss_host_page_finish(pss);
2331 return tmppages;
2332 }
2333
2334 pss_find_next_dirty(pss);
2335 } while (pss_within_range(pss));
2336
2337 pss_host_page_finish(pss);
2338
2339 res = ram_save_release_protection(rs, pss, start_page);
2340 return (res < 0 ? res : pages);
2341 }
2342
2343 /**
2344 * ram_find_and_save_block: finds a dirty page and sends it to f
2345 *
2346 * Called within an RCU critical section.
2347 *
2348 * Returns the number of pages written where zero means no dirty pages,
2349 * or negative on error
2350 *
2351 * @rs: current RAM state
2352 *
2353 * On systems where host-page-size > target-page-size it will send all the
2354 * pages in a host page that are dirty.
2355 */
2356 static int ram_find_and_save_block(RAMState *rs)
2357 {
2358 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2359 int pages = 0;
2360
2361 /* No dirty page as there is zero RAM */
2362 if (!rs->ram_bytes_total) {
2363 return pages;
2364 }
2365
2366 /*
2367 * Always keep last_seen_block/last_page valid during this procedure,
2368 * because find_dirty_block() relies on these values (e.g., we compare
2369 * last_seen_block with pss.block to see whether we searched all the
2370 * ramblocks) to detect the completion of migration. Having NULL value
2371 * of last_seen_block can conditionally cause below loop to run forever.
2372 */
2373 if (!rs->last_seen_block) {
2374 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2375 rs->last_page = 0;
2376 }
2377
2378 pss_init(pss, rs->last_seen_block, rs->last_page);
2379
2380 while (true){
2381 if (!get_queued_page(rs, pss)) {
2382 /* priority queue empty, so just search for something dirty */
2383 int res = find_dirty_block(rs, pss);
2384 if (res != PAGE_DIRTY_FOUND) {
2385 if (res == PAGE_ALL_CLEAN) {
2386 break;
2387 } else if (res == PAGE_TRY_AGAIN) {
2388 continue;
2389 } else if (res < 0) {
2390 pages = res;
2391 break;
2392 }
2393 }
2394 }
2395 pages = ram_save_host_page(rs, pss);
2396 if (pages) {
2397 break;
2398 }
2399 }
2400
2401 rs->last_seen_block = pss->block;
2402 rs->last_page = pss->page;
2403
2404 return pages;
2405 }
2406
2407 static uint64_t ram_bytes_total_with_ignored(void)
2408 {
2409 RAMBlock *block;
2410 uint64_t total = 0;
2411
2412 RCU_READ_LOCK_GUARD();
2413
2414 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2415 total += block->used_length;
2416 }
2417 return total;
2418 }
2419
2420 uint64_t ram_bytes_total(void)
2421 {
2422 RAMBlock *block;
2423 uint64_t total = 0;
2424
2425 RCU_READ_LOCK_GUARD();
2426
2427 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2428 total += block->used_length;
2429 }
2430 return total;
2431 }
2432
2433 static void xbzrle_load_setup(void)
2434 {
2435 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2436 }
2437
2438 static void xbzrle_load_cleanup(void)
2439 {
2440 g_free(XBZRLE.decoded_buf);
2441 XBZRLE.decoded_buf = NULL;
2442 }
2443
2444 static void ram_state_cleanup(RAMState **rsp)
2445 {
2446 if (*rsp) {
2447 migration_page_queue_free(*rsp);
2448 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2449 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2450 g_free(*rsp);
2451 *rsp = NULL;
2452 }
2453 }
2454
2455 static void xbzrle_cleanup(void)
2456 {
2457 XBZRLE_cache_lock();
2458 if (XBZRLE.cache) {
2459 cache_fini(XBZRLE.cache);
2460 g_free(XBZRLE.encoded_buf);
2461 g_free(XBZRLE.current_buf);
2462 g_free(XBZRLE.zero_target_page);
2463 XBZRLE.cache = NULL;
2464 XBZRLE.encoded_buf = NULL;
2465 XBZRLE.current_buf = NULL;
2466 XBZRLE.zero_target_page = NULL;
2467 }
2468 XBZRLE_cache_unlock();
2469 }
2470
2471 static void ram_save_cleanup(void *opaque)
2472 {
2473 RAMState **rsp = opaque;
2474 RAMBlock *block;
2475
2476 /* We don't use dirty log with background snapshots */
2477 if (!migrate_background_snapshot()) {
2478 /* caller have hold iothread lock or is in a bh, so there is
2479 * no writing race against the migration bitmap
2480 */
2481 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2482 /*
2483 * do not stop dirty log without starting it, since
2484 * memory_global_dirty_log_stop will assert that
2485 * memory_global_dirty_log_start/stop used in pairs
2486 */
2487 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2488 }
2489 }
2490
2491 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2492 g_free(block->clear_bmap);
2493 block->clear_bmap = NULL;
2494 g_free(block->bmap);
2495 block->bmap = NULL;
2496 }
2497
2498 xbzrle_cleanup();
2499 compress_threads_save_cleanup();
2500 ram_state_cleanup(rsp);
2501 g_free(migration_ops);
2502 migration_ops = NULL;
2503 }
2504
2505 static void ram_state_reset(RAMState *rs)
2506 {
2507 int i;
2508
2509 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2510 rs->pss[i].last_sent_block = NULL;
2511 }
2512
2513 rs->last_seen_block = NULL;
2514 rs->last_page = 0;
2515 rs->last_version = ram_list.version;
2516 rs->xbzrle_started = false;
2517 }
2518
2519 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2520
2521 /* **** functions for postcopy ***** */
2522
2523 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2524 {
2525 struct RAMBlock *block;
2526
2527 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2528 unsigned long *bitmap = block->bmap;
2529 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2530 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2531
2532 while (run_start < range) {
2533 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2534 ram_discard_range(block->idstr,
2535 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2536 ((ram_addr_t)(run_end - run_start))
2537 << TARGET_PAGE_BITS);
2538 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2539 }
2540 }
2541 }
2542
2543 /**
2544 * postcopy_send_discard_bm_ram: discard a RAMBlock
2545 *
2546 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2547 *
2548 * @ms: current migration state
2549 * @block: RAMBlock to discard
2550 */
2551 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2552 {
2553 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2554 unsigned long current;
2555 unsigned long *bitmap = block->bmap;
2556
2557 for (current = 0; current < end; ) {
2558 unsigned long one = find_next_bit(bitmap, end, current);
2559 unsigned long zero, discard_length;
2560
2561 if (one >= end) {
2562 break;
2563 }
2564
2565 zero = find_next_zero_bit(bitmap, end, one + 1);
2566
2567 if (zero >= end) {
2568 discard_length = end - one;
2569 } else {
2570 discard_length = zero - one;
2571 }
2572 postcopy_discard_send_range(ms, one, discard_length);
2573 current = one + discard_length;
2574 }
2575 }
2576
2577 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2578
2579 /**
2580 * postcopy_each_ram_send_discard: discard all RAMBlocks
2581 *
2582 * Utility for the outgoing postcopy code.
2583 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2584 * passing it bitmap indexes and name.
2585 * (qemu_ram_foreach_block ends up passing unscaled lengths
2586 * which would mean postcopy code would have to deal with target page)
2587 *
2588 * @ms: current migration state
2589 */
2590 static void postcopy_each_ram_send_discard(MigrationState *ms)
2591 {
2592 struct RAMBlock *block;
2593
2594 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2595 postcopy_discard_send_init(ms, block->idstr);
2596
2597 /*
2598 * Deal with TPS != HPS and huge pages. It discard any partially sent
2599 * host-page size chunks, mark any partially dirty host-page size
2600 * chunks as all dirty. In this case the host-page is the host-page
2601 * for the particular RAMBlock, i.e. it might be a huge page.
2602 */
2603 postcopy_chunk_hostpages_pass(ms, block);
2604
2605 /*
2606 * Postcopy sends chunks of bitmap over the wire, but it
2607 * just needs indexes at this point, avoids it having
2608 * target page specific code.
2609 */
2610 postcopy_send_discard_bm_ram(ms, block);
2611 postcopy_discard_send_finish(ms);
2612 }
2613 }
2614
2615 /**
2616 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2617 *
2618 * Helper for postcopy_chunk_hostpages; it's called twice to
2619 * canonicalize the two bitmaps, that are similar, but one is
2620 * inverted.
2621 *
2622 * Postcopy requires that all target pages in a hostpage are dirty or
2623 * clean, not a mix. This function canonicalizes the bitmaps.
2624 *
2625 * @ms: current migration state
2626 * @block: block that contains the page we want to canonicalize
2627 */
2628 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2629 {
2630 RAMState *rs = ram_state;
2631 unsigned long *bitmap = block->bmap;
2632 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2633 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2634 unsigned long run_start;
2635
2636 if (block->page_size == TARGET_PAGE_SIZE) {
2637 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2638 return;
2639 }
2640
2641 /* Find a dirty page */
2642 run_start = find_next_bit(bitmap, pages, 0);
2643
2644 while (run_start < pages) {
2645
2646 /*
2647 * If the start of this run of pages is in the middle of a host
2648 * page, then we need to fixup this host page.
2649 */
2650 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2651 /* Find the end of this run */
2652 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2653 /*
2654 * If the end isn't at the start of a host page, then the
2655 * run doesn't finish at the end of a host page
2656 * and we need to discard.
2657 */
2658 }
2659
2660 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2661 unsigned long page;
2662 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2663 host_ratio);
2664 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2665
2666 /* Clean up the bitmap */
2667 for (page = fixup_start_addr;
2668 page < fixup_start_addr + host_ratio; page++) {
2669 /*
2670 * Remark them as dirty, updating the count for any pages
2671 * that weren't previously dirty.
2672 */
2673 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2674 }
2675 }
2676
2677 /* Find the next dirty page for the next iteration */
2678 run_start = find_next_bit(bitmap, pages, run_start);
2679 }
2680 }
2681
2682 /**
2683 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2684 *
2685 * Transmit the set of pages to be discarded after precopy to the target
2686 * these are pages that:
2687 * a) Have been previously transmitted but are now dirty again
2688 * b) Pages that have never been transmitted, this ensures that
2689 * any pages on the destination that have been mapped by background
2690 * tasks get discarded (transparent huge pages is the specific concern)
2691 * Hopefully this is pretty sparse
2692 *
2693 * @ms: current migration state
2694 */
2695 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2696 {
2697 RAMState *rs = ram_state;
2698
2699 RCU_READ_LOCK_GUARD();
2700
2701 /* This should be our last sync, the src is now paused */
2702 migration_bitmap_sync(rs);
2703
2704 /* Easiest way to make sure we don't resume in the middle of a host-page */
2705 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2706 rs->last_seen_block = NULL;
2707 rs->last_page = 0;
2708
2709 postcopy_each_ram_send_discard(ms);
2710
2711 trace_ram_postcopy_send_discard_bitmap();
2712 }
2713
2714 /**
2715 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2716 *
2717 * Returns zero on success
2718 *
2719 * @rbname: name of the RAMBlock of the request. NULL means the
2720 * same that last one.
2721 * @start: RAMBlock starting page
2722 * @length: RAMBlock size
2723 */
2724 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2725 {
2726 trace_ram_discard_range(rbname, start, length);
2727
2728 RCU_READ_LOCK_GUARD();
2729 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2730
2731 if (!rb) {
2732 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2733 return -1;
2734 }
2735
2736 /*
2737 * On source VM, we don't need to update the received bitmap since
2738 * we don't even have one.
2739 */
2740 if (rb->receivedmap) {
2741 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2742 length >> qemu_target_page_bits());
2743 }
2744
2745 return ram_block_discard_range(rb, start, length);
2746 }
2747
2748 /*
2749 * For every allocation, we will try not to crash the VM if the
2750 * allocation failed.
2751 */
2752 static int xbzrle_init(void)
2753 {
2754 Error *local_err = NULL;
2755
2756 if (!migrate_xbzrle()) {
2757 return 0;
2758 }
2759
2760 XBZRLE_cache_lock();
2761
2762 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2763 if (!XBZRLE.zero_target_page) {
2764 error_report("%s: Error allocating zero page", __func__);
2765 goto err_out;
2766 }
2767
2768 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2769 TARGET_PAGE_SIZE, &local_err);
2770 if (!XBZRLE.cache) {
2771 error_report_err(local_err);
2772 goto free_zero_page;
2773 }
2774
2775 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2776 if (!XBZRLE.encoded_buf) {
2777 error_report("%s: Error allocating encoded_buf", __func__);
2778 goto free_cache;
2779 }
2780
2781 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2782 if (!XBZRLE.current_buf) {
2783 error_report("%s: Error allocating current_buf", __func__);
2784 goto free_encoded_buf;
2785 }
2786
2787 /* We are all good */
2788 XBZRLE_cache_unlock();
2789 return 0;
2790
2791 free_encoded_buf:
2792 g_free(XBZRLE.encoded_buf);
2793 XBZRLE.encoded_buf = NULL;
2794 free_cache:
2795 cache_fini(XBZRLE.cache);
2796 XBZRLE.cache = NULL;
2797 free_zero_page:
2798 g_free(XBZRLE.zero_target_page);
2799 XBZRLE.zero_target_page = NULL;
2800 err_out:
2801 XBZRLE_cache_unlock();
2802 return -ENOMEM;
2803 }
2804
2805 static int ram_state_init(RAMState **rsp)
2806 {
2807 *rsp = g_try_new0(RAMState, 1);
2808
2809 if (!*rsp) {
2810 error_report("%s: Init ramstate fail", __func__);
2811 return -1;
2812 }
2813
2814 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2815 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2816 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2817 (*rsp)->ram_bytes_total = ram_bytes_total();
2818
2819 /*
2820 * Count the total number of pages used by ram blocks not including any
2821 * gaps due to alignment or unplugs.
2822 * This must match with the initial values of dirty bitmap.
2823 */
2824 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2825 ram_state_reset(*rsp);
2826
2827 return 0;
2828 }
2829
2830 static void ram_list_init_bitmaps(void)
2831 {
2832 MigrationState *ms = migrate_get_current();
2833 RAMBlock *block;
2834 unsigned long pages;
2835 uint8_t shift;
2836
2837 /* Skip setting bitmap if there is no RAM */
2838 if (ram_bytes_total()) {
2839 shift = ms->clear_bitmap_shift;
2840 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2841 error_report("clear_bitmap_shift (%u) too big, using "
2842 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2843 shift = CLEAR_BITMAP_SHIFT_MAX;
2844 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2845 error_report("clear_bitmap_shift (%u) too small, using "
2846 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2847 shift = CLEAR_BITMAP_SHIFT_MIN;
2848 }
2849
2850 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2851 pages = block->max_length >> TARGET_PAGE_BITS;
2852 /*
2853 * The initial dirty bitmap for migration must be set with all
2854 * ones to make sure we'll migrate every guest RAM page to
2855 * destination.
2856 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2857 * new migration after a failed migration, ram_list.
2858 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2859 * guest memory.
2860 */
2861 block->bmap = bitmap_new(pages);
2862 bitmap_set(block->bmap, 0, pages);
2863 block->clear_bmap_shift = shift;
2864 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2865 }
2866 }
2867 }
2868
2869 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2870 {
2871 unsigned long pages;
2872 RAMBlock *rb;
2873
2874 RCU_READ_LOCK_GUARD();
2875
2876 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2877 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2878 rs->migration_dirty_pages -= pages;
2879 }
2880 }
2881
2882 static void ram_init_bitmaps(RAMState *rs)
2883 {
2884 /* For memory_global_dirty_log_start below. */
2885 qemu_mutex_lock_iothread();
2886 qemu_mutex_lock_ramlist();
2887
2888 WITH_RCU_READ_LOCK_GUARD() {
2889 ram_list_init_bitmaps();
2890 /* We don't use dirty log with background snapshots */
2891 if (!migrate_background_snapshot()) {
2892 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2893 migration_bitmap_sync_precopy(rs);
2894 }
2895 }
2896 qemu_mutex_unlock_ramlist();
2897 qemu_mutex_unlock_iothread();
2898
2899 /*
2900 * After an eventual first bitmap sync, fixup the initial bitmap
2901 * containing all 1s to exclude any discarded pages from migration.
2902 */
2903 migration_bitmap_clear_discarded_pages(rs);
2904 }
2905
2906 static int ram_init_all(RAMState **rsp)
2907 {
2908 if (ram_state_init(rsp)) {
2909 return -1;
2910 }
2911
2912 if (xbzrle_init()) {
2913 ram_state_cleanup(rsp);
2914 return -1;
2915 }
2916
2917 ram_init_bitmaps(*rsp);
2918
2919 return 0;
2920 }
2921
2922 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2923 {
2924 RAMBlock *block;
2925 uint64_t pages = 0;
2926
2927 /*
2928 * Postcopy is not using xbzrle/compression, so no need for that.
2929 * Also, since source are already halted, we don't need to care
2930 * about dirty page logging as well.
2931 */
2932
2933 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2934 pages += bitmap_count_one(block->bmap,
2935 block->used_length >> TARGET_PAGE_BITS);
2936 }
2937
2938 /* This may not be aligned with current bitmaps. Recalculate. */
2939 rs->migration_dirty_pages = pages;
2940
2941 ram_state_reset(rs);
2942
2943 /* Update RAMState cache of output QEMUFile */
2944 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2945
2946 trace_ram_state_resume_prepare(pages);
2947 }
2948
2949 /*
2950 * This function clears bits of the free pages reported by the caller from the
2951 * migration dirty bitmap. @addr is the host address corresponding to the
2952 * start of the continuous guest free pages, and @len is the total bytes of
2953 * those pages.
2954 */
2955 void qemu_guest_free_page_hint(void *addr, size_t len)
2956 {
2957 RAMBlock *block;
2958 ram_addr_t offset;
2959 size_t used_len, start, npages;
2960 MigrationState *s = migrate_get_current();
2961
2962 /* This function is currently expected to be used during live migration */
2963 if (!migration_is_setup_or_active(s->state)) {
2964 return;
2965 }
2966
2967 for (; len > 0; len -= used_len, addr += used_len) {
2968 block = qemu_ram_block_from_host(addr, false, &offset);
2969 if (unlikely(!block || offset >= block->used_length)) {
2970 /*
2971 * The implementation might not support RAMBlock resize during
2972 * live migration, but it could happen in theory with future
2973 * updates. So we add a check here to capture that case.
2974 */
2975 error_report_once("%s unexpected error", __func__);
2976 return;
2977 }
2978
2979 if (len <= block->used_length - offset) {
2980 used_len = len;
2981 } else {
2982 used_len = block->used_length - offset;
2983 }
2984
2985 start = offset >> TARGET_PAGE_BITS;
2986 npages = used_len >> TARGET_PAGE_BITS;
2987
2988 qemu_mutex_lock(&ram_state->bitmap_mutex);
2989 /*
2990 * The skipped free pages are equavalent to be sent from clear_bmap's
2991 * perspective, so clear the bits from the memory region bitmap which
2992 * are initially set. Otherwise those skipped pages will be sent in
2993 * the next round after syncing from the memory region bitmap.
2994 */
2995 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2996 ram_state->migration_dirty_pages -=
2997 bitmap_count_one_with_offset(block->bmap, start, npages);
2998 bitmap_clear(block->bmap, start, npages);
2999 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3000 }
3001 }
3002
3003 /*
3004 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3005 * long-running RCU critical section. When rcu-reclaims in the code
3006 * start to become numerous it will be necessary to reduce the
3007 * granularity of these critical sections.
3008 */
3009
3010 /**
3011 * ram_save_setup: Setup RAM for migration
3012 *
3013 * Returns zero to indicate success and negative for error
3014 *
3015 * @f: QEMUFile where to send the data
3016 * @opaque: RAMState pointer
3017 */
3018 static int ram_save_setup(QEMUFile *f, void *opaque)
3019 {
3020 RAMState **rsp = opaque;
3021 RAMBlock *block;
3022 int ret;
3023
3024 if (compress_threads_save_setup()) {
3025 return -1;
3026 }
3027
3028 /* migration has already setup the bitmap, reuse it. */
3029 if (!migration_in_colo_state()) {
3030 if (ram_init_all(rsp) != 0) {
3031 compress_threads_save_cleanup();
3032 return -1;
3033 }
3034 }
3035 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3036
3037 WITH_RCU_READ_LOCK_GUARD() {
3038 qemu_put_be64(f, ram_bytes_total_with_ignored()
3039 | RAM_SAVE_FLAG_MEM_SIZE);
3040
3041 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3042 qemu_put_byte(f, strlen(block->idstr));
3043 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3044 qemu_put_be64(f, block->used_length);
3045 if (migrate_postcopy_ram() && block->page_size !=
3046 qemu_host_page_size) {
3047 qemu_put_be64(f, block->page_size);
3048 }
3049 if (migrate_ignore_shared()) {
3050 qemu_put_be64(f, block->mr->addr);
3051 }
3052 }
3053 }
3054
3055 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3056 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3057
3058 migration_ops = g_malloc0(sizeof(MigrationOps));
3059 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3060 ret = multifd_send_sync_main(f);
3061 if (ret < 0) {
3062 return ret;
3063 }
3064
3065 if (!migrate_multifd_flush_after_each_section()) {
3066 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3067 }
3068
3069 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3070 qemu_fflush(f);
3071
3072 return 0;
3073 }
3074
3075 /**
3076 * ram_save_iterate: iterative stage for migration
3077 *
3078 * Returns zero to indicate success and negative for error
3079 *
3080 * @f: QEMUFile where to send the data
3081 * @opaque: RAMState pointer
3082 */
3083 static int ram_save_iterate(QEMUFile *f, void *opaque)
3084 {
3085 RAMState **temp = opaque;
3086 RAMState *rs = *temp;
3087 int ret = 0;
3088 int i;
3089 int64_t t0;
3090 int done = 0;
3091
3092 if (blk_mig_bulk_active()) {
3093 /* Avoid transferring ram during bulk phase of block migration as
3094 * the bulk phase will usually take a long time and transferring
3095 * ram updates during that time is pointless. */
3096 goto out;
3097 }
3098
3099 /*
3100 * We'll take this lock a little bit long, but it's okay for two reasons.
3101 * Firstly, the only possible other thread to take it is who calls
3102 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3103 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3104 * guarantees that we'll at least released it in a regular basis.
3105 */
3106 qemu_mutex_lock(&rs->bitmap_mutex);
3107 WITH_RCU_READ_LOCK_GUARD() {
3108 if (ram_list.version != rs->last_version) {
3109 ram_state_reset(rs);
3110 }
3111
3112 /* Read version before ram_list.blocks */
3113 smp_rmb();
3114
3115 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3116
3117 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3118 i = 0;
3119 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3120 postcopy_has_request(rs)) {
3121 int pages;
3122
3123 if (qemu_file_get_error(f)) {
3124 break;
3125 }
3126
3127 pages = ram_find_and_save_block(rs);
3128 /* no more pages to sent */
3129 if (pages == 0) {
3130 done = 1;
3131 break;
3132 }
3133
3134 if (pages < 0) {
3135 qemu_file_set_error(f, pages);
3136 break;
3137 }
3138
3139 rs->target_page_count += pages;
3140
3141 /*
3142 * During postcopy, it is necessary to make sure one whole host
3143 * page is sent in one chunk.
3144 */
3145 if (migrate_postcopy_ram()) {
3146 ram_flush_compressed_data(rs);
3147 }
3148
3149 /*
3150 * we want to check in the 1st loop, just in case it was the 1st
3151 * time and we had to sync the dirty bitmap.
3152 * qemu_clock_get_ns() is a bit expensive, so we only check each
3153 * some iterations
3154 */
3155 if ((i & 63) == 0) {
3156 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3157 1000000;
3158 if (t1 > MAX_WAIT) {
3159 trace_ram_save_iterate_big_wait(t1, i);
3160 break;
3161 }
3162 }
3163 i++;
3164 }
3165 }
3166 qemu_mutex_unlock(&rs->bitmap_mutex);
3167
3168 /*
3169 * Must occur before EOS (or any QEMUFile operation)
3170 * because of RDMA protocol.
3171 */
3172 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3173
3174 out:
3175 if (ret >= 0
3176 && migration_is_setup_or_active(migrate_get_current()->state)) {
3177 if (migrate_multifd_flush_after_each_section()) {
3178 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3179 if (ret < 0) {
3180 return ret;
3181 }
3182 }
3183
3184 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3185 qemu_fflush(f);
3186 ram_transferred_add(8);
3187
3188 ret = qemu_file_get_error(f);
3189 }
3190 if (ret < 0) {
3191 return ret;
3192 }
3193
3194 return done;
3195 }
3196
3197 /**
3198 * ram_save_complete: function called to send the remaining amount of ram
3199 *
3200 * Returns zero to indicate success or negative on error
3201 *
3202 * Called with iothread lock
3203 *
3204 * @f: QEMUFile where to send the data
3205 * @opaque: RAMState pointer
3206 */
3207 static int ram_save_complete(QEMUFile *f, void *opaque)
3208 {
3209 RAMState **temp = opaque;
3210 RAMState *rs = *temp;
3211 int ret = 0;
3212
3213 rs->last_stage = !migration_in_colo_state();
3214
3215 WITH_RCU_READ_LOCK_GUARD() {
3216 if (!migration_in_postcopy()) {
3217 migration_bitmap_sync_precopy(rs);
3218 }
3219
3220 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3221
3222 /* try transferring iterative blocks of memory */
3223
3224 /* flush all remaining blocks regardless of rate limiting */
3225 qemu_mutex_lock(&rs->bitmap_mutex);
3226 while (true) {
3227 int pages;
3228
3229 pages = ram_find_and_save_block(rs);
3230 /* no more blocks to sent */
3231 if (pages == 0) {
3232 break;
3233 }
3234 if (pages < 0) {
3235 ret = pages;
3236 break;
3237 }
3238 }
3239 qemu_mutex_unlock(&rs->bitmap_mutex);
3240
3241 ram_flush_compressed_data(rs);
3242 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3243 }
3244
3245 if (ret < 0) {
3246 return ret;
3247 }
3248
3249 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3250 if (ret < 0) {
3251 return ret;
3252 }
3253
3254 if (!migrate_multifd_flush_after_each_section()) {
3255 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3256 }
3257 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3258 qemu_fflush(f);
3259
3260 return 0;
3261 }
3262
3263 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3264 uint64_t *can_postcopy)
3265 {
3266 RAMState **temp = opaque;
3267 RAMState *rs = *temp;
3268
3269 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3270
3271 if (migrate_postcopy_ram()) {
3272 /* We can do postcopy, and all the data is postcopiable */
3273 *can_postcopy += remaining_size;
3274 } else {
3275 *must_precopy += remaining_size;
3276 }
3277 }
3278
3279 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3280 uint64_t *can_postcopy)
3281 {
3282 MigrationState *s = migrate_get_current();
3283 RAMState **temp = opaque;
3284 RAMState *rs = *temp;
3285
3286 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3287
3288 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3289 qemu_mutex_lock_iothread();
3290 WITH_RCU_READ_LOCK_GUARD() {
3291 migration_bitmap_sync_precopy(rs);
3292 }
3293 qemu_mutex_unlock_iothread();
3294 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3295 }
3296
3297 if (migrate_postcopy_ram()) {
3298 /* We can do postcopy, and all the data is postcopiable */
3299 *can_postcopy += remaining_size;
3300 } else {
3301 *must_precopy += remaining_size;
3302 }
3303 }
3304
3305 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3306 {
3307 unsigned int xh_len;
3308 int xh_flags;
3309 uint8_t *loaded_data;
3310
3311 /* extract RLE header */
3312 xh_flags = qemu_get_byte(f);
3313 xh_len = qemu_get_be16(f);
3314
3315 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3316 error_report("Failed to load XBZRLE page - wrong compression!");
3317 return -1;
3318 }
3319
3320 if (xh_len > TARGET_PAGE_SIZE) {
3321 error_report("Failed to load XBZRLE page - len overflow!");
3322 return -1;
3323 }
3324 loaded_data = XBZRLE.decoded_buf;
3325 /* load data and decode */
3326 /* it can change loaded_data to point to an internal buffer */
3327 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3328
3329 /* decode RLE */
3330 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3331 TARGET_PAGE_SIZE) == -1) {
3332 error_report("Failed to load XBZRLE page - decode error!");
3333 return -1;
3334 }
3335
3336 return 0;
3337 }
3338
3339 /**
3340 * ram_block_from_stream: read a RAMBlock id from the migration stream
3341 *
3342 * Must be called from within a rcu critical section.
3343 *
3344 * Returns a pointer from within the RCU-protected ram_list.
3345 *
3346 * @mis: the migration incoming state pointer
3347 * @f: QEMUFile where to read the data from
3348 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3349 * @channel: the channel we're using
3350 */
3351 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3352 QEMUFile *f, int flags,
3353 int channel)
3354 {
3355 RAMBlock *block = mis->last_recv_block[channel];
3356 char id[256];
3357 uint8_t len;
3358
3359 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3360 if (!block) {
3361 error_report("Ack, bad migration stream!");
3362 return NULL;
3363 }
3364 return block;
3365 }
3366
3367 len = qemu_get_byte(f);
3368 qemu_get_buffer(f, (uint8_t *)id, len);
3369 id[len] = 0;
3370
3371 block = qemu_ram_block_by_name(id);
3372 if (!block) {
3373 error_report("Can't find block %s", id);
3374 return NULL;
3375 }
3376
3377 if (ramblock_is_ignored(block)) {
3378 error_report("block %s should not be migrated !", id);
3379 return NULL;
3380 }
3381
3382 mis->last_recv_block[channel] = block;
3383
3384 return block;
3385 }
3386
3387 static inline void *host_from_ram_block_offset(RAMBlock *block,
3388 ram_addr_t offset)
3389 {
3390 if (!offset_in_ramblock(block, offset)) {
3391 return NULL;
3392 }
3393
3394 return block->host + offset;
3395 }
3396
3397 static void *host_page_from_ram_block_offset(RAMBlock *block,
3398 ram_addr_t offset)
3399 {
3400 /* Note: Explicitly no check against offset_in_ramblock(). */
3401 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3402 block->page_size);
3403 }
3404
3405 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3406 ram_addr_t offset)
3407 {
3408 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3409 }
3410
3411 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3412 {
3413 qemu_mutex_lock(&ram_state->bitmap_mutex);
3414 for (int i = 0; i < pages; i++) {
3415 ram_addr_t offset = normal[i];
3416 ram_state->migration_dirty_pages += !test_and_set_bit(
3417 offset >> TARGET_PAGE_BITS,
3418 block->bmap);
3419 }
3420 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3421 }
3422
3423 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3424 ram_addr_t offset, bool record_bitmap)
3425 {
3426 if (!offset_in_ramblock(block, offset)) {
3427 return NULL;
3428 }
3429 if (!block->colo_cache) {
3430 error_report("%s: colo_cache is NULL in block :%s",
3431 __func__, block->idstr);
3432 return NULL;
3433 }
3434
3435 /*
3436 * During colo checkpoint, we need bitmap of these migrated pages.
3437 * It help us to decide which pages in ram cache should be flushed
3438 * into VM's RAM later.
3439 */
3440 if (record_bitmap) {
3441 colo_record_bitmap(block, &offset, 1);
3442 }
3443 return block->colo_cache + offset;
3444 }
3445
3446 /**
3447 * ram_handle_compressed: handle the zero page case
3448 *
3449 * If a page (or a whole RDMA chunk) has been
3450 * determined to be zero, then zap it.
3451 *
3452 * @host: host address for the zero page
3453 * @ch: what the page is filled from. We only support zero
3454 * @size: size of the zero page
3455 */
3456 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3457 {
3458 if (ch != 0 || !buffer_is_zero(host, size)) {
3459 memset(host, ch, size);
3460 }
3461 }
3462
3463 static void colo_init_ram_state(void)
3464 {
3465 ram_state_init(&ram_state);
3466 }
3467
3468 /*
3469 * colo cache: this is for secondary VM, we cache the whole
3470 * memory of the secondary VM, it is need to hold the global lock
3471 * to call this helper.
3472 */
3473 int colo_init_ram_cache(void)
3474 {
3475 RAMBlock *block;
3476
3477 WITH_RCU_READ_LOCK_GUARD() {
3478 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3479 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3480 NULL, false, false);
3481 if (!block->colo_cache) {
3482 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3483 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3484 block->used_length);
3485 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3486 if (block->colo_cache) {
3487 qemu_anon_ram_free(block->colo_cache, block->used_length);
3488 block->colo_cache = NULL;
3489 }
3490 }
3491 return -errno;
3492 }
3493 if (!machine_dump_guest_core(current_machine)) {
3494 qemu_madvise(block->colo_cache, block->used_length,
3495 QEMU_MADV_DONTDUMP);
3496 }
3497 }
3498 }
3499
3500 /*
3501 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3502 * with to decide which page in cache should be flushed into SVM's RAM. Here
3503 * we use the same name 'ram_bitmap' as for migration.
3504 */
3505 if (ram_bytes_total()) {
3506 RAMBlock *block;
3507
3508 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3509 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3510 block->bmap = bitmap_new(pages);
3511 }
3512 }
3513
3514 colo_init_ram_state();
3515 return 0;
3516 }
3517
3518 /* TODO: duplicated with ram_init_bitmaps */
3519 void colo_incoming_start_dirty_log(void)
3520 {
3521 RAMBlock *block = NULL;
3522 /* For memory_global_dirty_log_start below. */
3523 qemu_mutex_lock_iothread();
3524 qemu_mutex_lock_ramlist();
3525
3526 memory_global_dirty_log_sync();
3527 WITH_RCU_READ_LOCK_GUARD() {
3528 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3529 ramblock_sync_dirty_bitmap(ram_state, block);
3530 /* Discard this dirty bitmap record */
3531 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3532 }
3533 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3534 }
3535 ram_state->migration_dirty_pages = 0;
3536 qemu_mutex_unlock_ramlist();
3537 qemu_mutex_unlock_iothread();
3538 }
3539
3540 /* It is need to hold the global lock to call this helper */
3541 void colo_release_ram_cache(void)
3542 {
3543 RAMBlock *block;
3544
3545 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3546 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3547 g_free(block->bmap);
3548 block->bmap = NULL;
3549 }
3550
3551 WITH_RCU_READ_LOCK_GUARD() {
3552 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3553 if (block->colo_cache) {
3554 qemu_anon_ram_free(block->colo_cache, block->used_length);
3555 block->colo_cache = NULL;
3556 }
3557 }
3558 }
3559 ram_state_cleanup(&ram_state);
3560 }
3561
3562 /**
3563 * ram_load_setup: Setup RAM for migration incoming side
3564 *
3565 * Returns zero to indicate success and negative for error
3566 *
3567 * @f: QEMUFile where to receive the data
3568 * @opaque: RAMState pointer
3569 */
3570 static int ram_load_setup(QEMUFile *f, void *opaque)
3571 {
3572 xbzrle_load_setup();
3573 ramblock_recv_map_init();
3574
3575 return 0;
3576 }
3577
3578 static int ram_load_cleanup(void *opaque)
3579 {
3580 RAMBlock *rb;
3581
3582 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3583 qemu_ram_block_writeback(rb);
3584 }
3585
3586 xbzrle_load_cleanup();
3587
3588 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3589 g_free(rb->receivedmap);
3590 rb->receivedmap = NULL;
3591 }
3592
3593 return 0;
3594 }
3595
3596 /**
3597 * ram_postcopy_incoming_init: allocate postcopy data structures
3598 *
3599 * Returns 0 for success and negative if there was one error
3600 *
3601 * @mis: current migration incoming state
3602 *
3603 * Allocate data structures etc needed by incoming migration with
3604 * postcopy-ram. postcopy-ram's similarly names
3605 * postcopy_ram_incoming_init does the work.
3606 */
3607 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3608 {
3609 return postcopy_ram_incoming_init(mis);
3610 }
3611
3612 /**
3613 * ram_load_postcopy: load a page in postcopy case
3614 *
3615 * Returns 0 for success or -errno in case of error
3616 *
3617 * Called in postcopy mode by ram_load().
3618 * rcu_read_lock is taken prior to this being called.
3619 *
3620 * @f: QEMUFile where to send the data
3621 * @channel: the channel to use for loading
3622 */
3623 int ram_load_postcopy(QEMUFile *f, int channel)
3624 {
3625 int flags = 0, ret = 0;
3626 bool place_needed = false;
3627 bool matches_target_page_size = false;
3628 MigrationIncomingState *mis = migration_incoming_get_current();
3629 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3630
3631 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3632 ram_addr_t addr;
3633 void *page_buffer = NULL;
3634 void *place_source = NULL;
3635 RAMBlock *block = NULL;
3636 uint8_t ch;
3637 int len;
3638
3639 addr = qemu_get_be64(f);
3640
3641 /*
3642 * If qemu file error, we should stop here, and then "addr"
3643 * may be invalid
3644 */
3645 ret = qemu_file_get_error(f);
3646 if (ret) {
3647 break;
3648 }
3649
3650 flags = addr & ~TARGET_PAGE_MASK;
3651 addr &= TARGET_PAGE_MASK;
3652
3653 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3654 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3655 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3656 block = ram_block_from_stream(mis, f, flags, channel);
3657 if (!block) {
3658 ret = -EINVAL;
3659 break;
3660 }
3661
3662 /*
3663 * Relying on used_length is racy and can result in false positives.
3664 * We might place pages beyond used_length in case RAM was shrunk
3665 * while in postcopy, which is fine - trying to place via
3666 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3667 */
3668 if (!block->host || addr >= block->postcopy_length) {
3669 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3670 ret = -EINVAL;
3671 break;
3672 }
3673 tmp_page->target_pages++;
3674 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3675 /*
3676 * Postcopy requires that we place whole host pages atomically;
3677 * these may be huge pages for RAMBlocks that are backed by
3678 * hugetlbfs.
3679 * To make it atomic, the data is read into a temporary page
3680 * that's moved into place later.
3681 * The migration protocol uses, possibly smaller, target-pages
3682 * however the source ensures it always sends all the components
3683 * of a host page in one chunk.
3684 */
3685 page_buffer = tmp_page->tmp_huge_page +
3686 host_page_offset_from_ram_block_offset(block, addr);
3687 /* If all TP are zero then we can optimise the place */
3688 if (tmp_page->target_pages == 1) {
3689 tmp_page->host_addr =
3690 host_page_from_ram_block_offset(block, addr);
3691 } else if (tmp_page->host_addr !=
3692 host_page_from_ram_block_offset(block, addr)) {
3693 /* not the 1st TP within the HP */
3694 error_report("Non-same host page detected on channel %d: "
3695 "Target host page %p, received host page %p "
3696 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3697 channel, tmp_page->host_addr,
3698 host_page_from_ram_block_offset(block, addr),
3699 block->idstr, addr, tmp_page->target_pages);
3700 ret = -EINVAL;
3701 break;
3702 }
3703
3704 /*
3705 * If it's the last part of a host page then we place the host
3706 * page
3707 */
3708 if (tmp_page->target_pages ==
3709 (block->page_size / TARGET_PAGE_SIZE)) {
3710 place_needed = true;
3711 }
3712 place_source = tmp_page->tmp_huge_page;
3713 }
3714
3715 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3716 case RAM_SAVE_FLAG_ZERO:
3717 ch = qemu_get_byte(f);
3718 /*
3719 * Can skip to set page_buffer when
3720 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3721 */
3722 if (ch || !matches_target_page_size) {
3723 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3724 }
3725 if (ch) {
3726 tmp_page->all_zero = false;
3727 }
3728 break;
3729
3730 case RAM_SAVE_FLAG_PAGE:
3731 tmp_page->all_zero = false;
3732 if (!matches_target_page_size) {
3733 /* For huge pages, we always use temporary buffer */
3734 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3735 } else {
3736 /*
3737 * For small pages that matches target page size, we
3738 * avoid the qemu_file copy. Instead we directly use
3739 * the buffer of QEMUFile to place the page. Note: we
3740 * cannot do any QEMUFile operation before using that
3741 * buffer to make sure the buffer is valid when
3742 * placing the page.
3743 */
3744 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3745 TARGET_PAGE_SIZE);
3746 }
3747 break;
3748 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3749 tmp_page->all_zero = false;
3750 len = qemu_get_be32(f);
3751 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3752 error_report("Invalid compressed data length: %d", len);
3753 ret = -EINVAL;
3754 break;
3755 }
3756 decompress_data_with_multi_threads(f, page_buffer, len);
3757 break;
3758 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3759 multifd_recv_sync_main();
3760 break;
3761 case RAM_SAVE_FLAG_EOS:
3762 /* normal exit */
3763 if (migrate_multifd_flush_after_each_section()) {
3764 multifd_recv_sync_main();
3765 }
3766 break;
3767 default:
3768 error_report("Unknown combination of migration flags: 0x%x"
3769 " (postcopy mode)", flags);
3770 ret = -EINVAL;
3771 break;
3772 }
3773
3774 /* Got the whole host page, wait for decompress before placing. */
3775 if (place_needed) {
3776 ret |= wait_for_decompress_done();
3777 }
3778
3779 /* Detect for any possible file errors */
3780 if (!ret && qemu_file_get_error(f)) {
3781 ret = qemu_file_get_error(f);
3782 }
3783
3784 if (!ret && place_needed) {
3785 if (tmp_page->all_zero) {
3786 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3787 } else {
3788 ret = postcopy_place_page(mis, tmp_page->host_addr,
3789 place_source, block);
3790 }
3791 place_needed = false;
3792 postcopy_temp_page_reset(tmp_page);
3793 }
3794 }
3795
3796 return ret;
3797 }
3798
3799 static bool postcopy_is_running(void)
3800 {
3801 PostcopyState ps = postcopy_state_get();
3802 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3803 }
3804
3805 /*
3806 * Flush content of RAM cache into SVM's memory.
3807 * Only flush the pages that be dirtied by PVM or SVM or both.
3808 */
3809 void colo_flush_ram_cache(void)
3810 {
3811 RAMBlock *block = NULL;
3812 void *dst_host;
3813 void *src_host;
3814 unsigned long offset = 0;
3815
3816 memory_global_dirty_log_sync();
3817 qemu_mutex_lock(&ram_state->bitmap_mutex);
3818 WITH_RCU_READ_LOCK_GUARD() {
3819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3820 ramblock_sync_dirty_bitmap(ram_state, block);
3821 }
3822 }
3823
3824 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3825 WITH_RCU_READ_LOCK_GUARD() {
3826 block = QLIST_FIRST_RCU(&ram_list.blocks);
3827
3828 while (block) {
3829 unsigned long num = 0;
3830
3831 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3832 if (!offset_in_ramblock(block,
3833 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3834 offset = 0;
3835 num = 0;
3836 block = QLIST_NEXT_RCU(block, next);
3837 } else {
3838 unsigned long i = 0;
3839
3840 for (i = 0; i < num; i++) {
3841 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3842 }
3843 dst_host = block->host
3844 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3845 src_host = block->colo_cache
3846 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3847 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3848 offset += num;
3849 }
3850 }
3851 }
3852 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3853 trace_colo_flush_ram_cache_end();
3854 }
3855
3856 /**
3857 * ram_load_precopy: load pages in precopy case
3858 *
3859 * Returns 0 for success or -errno in case of error
3860 *
3861 * Called in precopy mode by ram_load().
3862 * rcu_read_lock is taken prior to this being called.
3863 *
3864 * @f: QEMUFile where to send the data
3865 */
3866 static int ram_load_precopy(QEMUFile *f)
3867 {
3868 MigrationIncomingState *mis = migration_incoming_get_current();
3869 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3870 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3871 bool postcopy_advised = migration_incoming_postcopy_advised();
3872 if (!migrate_compress()) {
3873 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3874 }
3875
3876 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3877 ram_addr_t addr, total_ram_bytes;
3878 void *host = NULL, *host_bak = NULL;
3879 uint8_t ch;
3880
3881 /*
3882 * Yield periodically to let main loop run, but an iteration of
3883 * the main loop is expensive, so do it each some iterations
3884 */
3885 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3886 aio_co_schedule(qemu_get_current_aio_context(),
3887 qemu_coroutine_self());
3888 qemu_coroutine_yield();
3889 }
3890 i++;
3891
3892 addr = qemu_get_be64(f);
3893 flags = addr & ~TARGET_PAGE_MASK;
3894 addr &= TARGET_PAGE_MASK;
3895
3896 if (flags & invalid_flags) {
3897 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3898 error_report("Received an unexpected compressed page");
3899 }
3900
3901 ret = -EINVAL;
3902 break;
3903 }
3904
3905 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3906 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3907 RAMBlock *block = ram_block_from_stream(mis, f, flags,
3908 RAM_CHANNEL_PRECOPY);
3909
3910 host = host_from_ram_block_offset(block, addr);
3911 /*
3912 * After going into COLO stage, we should not load the page
3913 * into SVM's memory directly, we put them into colo_cache firstly.
3914 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3915 * Previously, we copied all these memory in preparing stage of COLO
3916 * while we need to stop VM, which is a time-consuming process.
3917 * Here we optimize it by a trick, back-up every page while in
3918 * migration process while COLO is enabled, though it affects the
3919 * speed of the migration, but it obviously reduce the downtime of
3920 * back-up all SVM'S memory in COLO preparing stage.
3921 */
3922 if (migration_incoming_colo_enabled()) {
3923 if (migration_incoming_in_colo_state()) {
3924 /* In COLO stage, put all pages into cache temporarily */
3925 host = colo_cache_from_block_offset(block, addr, true);
3926 } else {
3927 /*
3928 * In migration stage but before COLO stage,
3929 * Put all pages into both cache and SVM's memory.
3930 */
3931 host_bak = colo_cache_from_block_offset(block, addr, false);
3932 }
3933 }
3934 if (!host) {
3935 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3936 ret = -EINVAL;
3937 break;
3938 }
3939 if (!migration_incoming_in_colo_state()) {
3940 ramblock_recv_bitmap_set(block, host);
3941 }
3942
3943 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3944 }
3945
3946 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3947 case RAM_SAVE_FLAG_MEM_SIZE:
3948 /* Synchronize RAM block list */
3949 total_ram_bytes = addr;
3950 while (!ret && total_ram_bytes) {
3951 RAMBlock *block;
3952 char id[256];
3953 ram_addr_t length;
3954
3955 len = qemu_get_byte(f);
3956 qemu_get_buffer(f, (uint8_t *)id, len);
3957 id[len] = 0;
3958 length = qemu_get_be64(f);
3959
3960 block = qemu_ram_block_by_name(id);
3961 if (block && !qemu_ram_is_migratable(block)) {
3962 error_report("block %s should not be migrated !", id);
3963 ret = -EINVAL;
3964 } else if (block) {
3965 if (length != block->used_length) {
3966 Error *local_err = NULL;
3967
3968 ret = qemu_ram_resize(block, length,
3969 &local_err);
3970 if (local_err) {
3971 error_report_err(local_err);
3972 }
3973 }
3974 /* For postcopy we need to check hugepage sizes match */
3975 if (postcopy_advised && migrate_postcopy_ram() &&
3976 block->page_size != qemu_host_page_size) {
3977 uint64_t remote_page_size = qemu_get_be64(f);
3978 if (remote_page_size != block->page_size) {
3979 error_report("Mismatched RAM page size %s "
3980 "(local) %zd != %" PRId64,
3981 id, block->page_size,
3982 remote_page_size);
3983 ret = -EINVAL;
3984 }
3985 }
3986 if (migrate_ignore_shared()) {
3987 hwaddr addr = qemu_get_be64(f);
3988 if (ramblock_is_ignored(block) &&
3989 block->mr->addr != addr) {
3990 error_report("Mismatched GPAs for block %s "
3991 "%" PRId64 "!= %" PRId64,
3992 id, (uint64_t)addr,
3993 (uint64_t)block->mr->addr);
3994 ret = -EINVAL;
3995 }
3996 }
3997 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3998 block->idstr);
3999 } else {
4000 error_report("Unknown ramblock \"%s\", cannot "
4001 "accept migration", id);
4002 ret = -EINVAL;
4003 }
4004
4005 total_ram_bytes -= length;
4006 }
4007 break;
4008
4009 case RAM_SAVE_FLAG_ZERO:
4010 ch = qemu_get_byte(f);
4011 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4012 break;
4013
4014 case RAM_SAVE_FLAG_PAGE:
4015 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4016 break;
4017
4018 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4019 len = qemu_get_be32(f);
4020 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4021 error_report("Invalid compressed data length: %d", len);
4022 ret = -EINVAL;
4023 break;
4024 }
4025 decompress_data_with_multi_threads(f, host, len);
4026 break;
4027
4028 case RAM_SAVE_FLAG_XBZRLE:
4029 if (load_xbzrle(f, addr, host) < 0) {
4030 error_report("Failed to decompress XBZRLE page at "
4031 RAM_ADDR_FMT, addr);
4032 ret = -EINVAL;
4033 break;
4034 }
4035 break;
4036 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4037 multifd_recv_sync_main();
4038 break;
4039 case RAM_SAVE_FLAG_EOS:
4040 /* normal exit */
4041 if (migrate_multifd_flush_after_each_section()) {
4042 multifd_recv_sync_main();
4043 }
4044 break;
4045 case RAM_SAVE_FLAG_HOOK:
4046 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4047 break;
4048 default:
4049 error_report("Unknown combination of migration flags: 0x%x", flags);
4050 ret = -EINVAL;
4051 }
4052 if (!ret) {
4053 ret = qemu_file_get_error(f);
4054 }
4055 if (!ret && host_bak) {
4056 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4057 }
4058 }
4059
4060 ret |= wait_for_decompress_done();
4061 return ret;
4062 }
4063
4064 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4065 {
4066 int ret = 0;
4067 static uint64_t seq_iter;
4068 /*
4069 * If system is running in postcopy mode, page inserts to host memory must
4070 * be atomic
4071 */
4072 bool postcopy_running = postcopy_is_running();
4073
4074 seq_iter++;
4075
4076 if (version_id != 4) {
4077 return -EINVAL;
4078 }
4079
4080 /*
4081 * This RCU critical section can be very long running.
4082 * When RCU reclaims in the code start to become numerous,
4083 * it will be necessary to reduce the granularity of this
4084 * critical section.
4085 */
4086 WITH_RCU_READ_LOCK_GUARD() {
4087 if (postcopy_running) {
4088 /*
4089 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4090 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4091 * service fast page faults.
4092 */
4093 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4094 } else {
4095 ret = ram_load_precopy(f);
4096 }
4097 }
4098 trace_ram_load_complete(ret, seq_iter);
4099
4100 return ret;
4101 }
4102
4103 static bool ram_has_postcopy(void *opaque)
4104 {
4105 RAMBlock *rb;
4106 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4107 if (ramblock_is_pmem(rb)) {
4108 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4109 "is not supported now!", rb->idstr, rb->host);
4110 return false;
4111 }
4112 }
4113
4114 return migrate_postcopy_ram();
4115 }
4116
4117 /* Sync all the dirty bitmap with destination VM. */
4118 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4119 {
4120 RAMBlock *block;
4121 QEMUFile *file = s->to_dst_file;
4122 int ramblock_count = 0;
4123
4124 trace_ram_dirty_bitmap_sync_start();
4125
4126 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4127 qemu_savevm_send_recv_bitmap(file, block->idstr);
4128 trace_ram_dirty_bitmap_request(block->idstr);
4129 ramblock_count++;
4130 }
4131
4132 trace_ram_dirty_bitmap_sync_wait();
4133
4134 /* Wait until all the ramblocks' dirty bitmap synced */
4135 while (ramblock_count--) {
4136 qemu_sem_wait(&s->rp_state.rp_sem);
4137 }
4138
4139 trace_ram_dirty_bitmap_sync_complete();
4140
4141 return 0;
4142 }
4143
4144 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4145 {
4146 qemu_sem_post(&s->rp_state.rp_sem);
4147 }
4148
4149 /*
4150 * Read the received bitmap, revert it as the initial dirty bitmap.
4151 * This is only used when the postcopy migration is paused but wants
4152 * to resume from a middle point.
4153 */
4154 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4155 {
4156 int ret = -EINVAL;
4157 /* from_dst_file is always valid because we're within rp_thread */
4158 QEMUFile *file = s->rp_state.from_dst_file;
4159 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4160 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4161 uint64_t size, end_mark;
4162
4163 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4164
4165 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4166 error_report("%s: incorrect state %s", __func__,
4167 MigrationStatus_str(s->state));
4168 return -EINVAL;
4169 }
4170
4171 /*
4172 * Note: see comments in ramblock_recv_bitmap_send() on why we
4173 * need the endianness conversion, and the paddings.
4174 */
4175 local_size = ROUND_UP(local_size, 8);
4176
4177 /* Add paddings */
4178 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4179
4180 size = qemu_get_be64(file);
4181
4182 /* The size of the bitmap should match with our ramblock */
4183 if (size != local_size) {
4184 error_report("%s: ramblock '%s' bitmap size mismatch "
4185 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4186 block->idstr, size, local_size);
4187 ret = -EINVAL;
4188 goto out;
4189 }
4190
4191 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4192 end_mark = qemu_get_be64(file);
4193
4194 ret = qemu_file_get_error(file);
4195 if (ret || size != local_size) {
4196 error_report("%s: read bitmap failed for ramblock '%s': %d"
4197 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4198 __func__, block->idstr, ret, local_size, size);
4199 ret = -EIO;
4200 goto out;
4201 }
4202
4203 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4204 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4205 __func__, block->idstr, end_mark);
4206 ret = -EINVAL;
4207 goto out;
4208 }
4209
4210 /*
4211 * Endianness conversion. We are during postcopy (though paused).
4212 * The dirty bitmap won't change. We can directly modify it.
4213 */
4214 bitmap_from_le(block->bmap, le_bitmap, nbits);
4215
4216 /*
4217 * What we received is "received bitmap". Revert it as the initial
4218 * dirty bitmap for this ramblock.
4219 */
4220 bitmap_complement(block->bmap, block->bmap, nbits);
4221
4222 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4223 ramblock_dirty_bitmap_clear_discarded_pages(block);
4224
4225 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4226 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4227
4228 /*
4229 * We succeeded to sync bitmap for current ramblock. If this is
4230 * the last one to sync, we need to notify the main send thread.
4231 */
4232 ram_dirty_bitmap_reload_notify(s);
4233
4234 ret = 0;
4235 out:
4236 g_free(le_bitmap);
4237 return ret;
4238 }
4239
4240 static int ram_resume_prepare(MigrationState *s, void *opaque)
4241 {
4242 RAMState *rs = *(RAMState **)opaque;
4243 int ret;
4244
4245 ret = ram_dirty_bitmap_sync_all(s, rs);
4246 if (ret) {
4247 return ret;
4248 }
4249
4250 ram_state_resume_prepare(rs, s->to_dst_file);
4251
4252 return 0;
4253 }
4254
4255 void postcopy_preempt_shutdown_file(MigrationState *s)
4256 {
4257 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4258 qemu_fflush(s->postcopy_qemufile_src);
4259 }
4260
4261 static SaveVMHandlers savevm_ram_handlers = {
4262 .save_setup = ram_save_setup,
4263 .save_live_iterate = ram_save_iterate,
4264 .save_live_complete_postcopy = ram_save_complete,
4265 .save_live_complete_precopy = ram_save_complete,
4266 .has_postcopy = ram_has_postcopy,
4267 .state_pending_exact = ram_state_pending_exact,
4268 .state_pending_estimate = ram_state_pending_estimate,
4269 .load_state = ram_load,
4270 .save_cleanup = ram_save_cleanup,
4271 .load_setup = ram_load_setup,
4272 .load_cleanup = ram_load_cleanup,
4273 .resume_prepare = ram_resume_prepare,
4274 };
4275
4276 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4277 size_t old_size, size_t new_size)
4278 {
4279 PostcopyState ps = postcopy_state_get();
4280 ram_addr_t offset;
4281 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4282 Error *err = NULL;
4283
4284 if (ramblock_is_ignored(rb)) {
4285 return;
4286 }
4287
4288 if (!migration_is_idle()) {
4289 /*
4290 * Precopy code on the source cannot deal with the size of RAM blocks
4291 * changing at random points in time - especially after sending the
4292 * RAM block sizes in the migration stream, they must no longer change.
4293 * Abort and indicate a proper reason.
4294 */
4295 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4296 migration_cancel(err);
4297 error_free(err);
4298 }
4299
4300 switch (ps) {
4301 case POSTCOPY_INCOMING_ADVISE:
4302 /*
4303 * Update what ram_postcopy_incoming_init()->init_range() does at the
4304 * time postcopy was advised. Syncing RAM blocks with the source will
4305 * result in RAM resizes.
4306 */
4307 if (old_size < new_size) {
4308 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4309 error_report("RAM block '%s' discard of resized RAM failed",
4310 rb->idstr);
4311 }
4312 }
4313 rb->postcopy_length = new_size;
4314 break;
4315 case POSTCOPY_INCOMING_NONE:
4316 case POSTCOPY_INCOMING_RUNNING:
4317 case POSTCOPY_INCOMING_END:
4318 /*
4319 * Once our guest is running, postcopy does no longer care about
4320 * resizes. When growing, the new memory was not available on the
4321 * source, no handler needed.
4322 */
4323 break;
4324 default:
4325 error_report("RAM block '%s' resized during postcopy state: %d",
4326 rb->idstr, ps);
4327 exit(-1);
4328 }
4329 }
4330
4331 static RAMBlockNotifier ram_mig_ram_notifier = {
4332 .ram_block_resized = ram_mig_ram_block_resized,
4333 };
4334
4335 void ram_mig_init(void)
4336 {
4337 qemu_mutex_init(&XBZRLE.lock);
4338 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4339 ram_block_notifier_add(&ram_mig_ram_notifier);
4340 }