]> git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
migration: Export ram_release_page()
[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66
67 /***********************************************************/
68 /* ram save/restore */
69
/*
 * Flag bits of the RAM migration stream.  They travel in the low bits
 * of the offset written by save_page_header().
 *
 * RAM_SAVE_FLAG_ZERO was formerly called RAM_SAVE_FLAG_COMPRESS and
 * covered pages that were filled with any single repeated byte.  It now
 * only covers all-zero pages, and was renamed to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
#define RAM_SAVE_FLAG_FULL          0x001 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO          0x002
#define RAM_SAVE_FLAG_MEM_SIZE      0x004
#define RAM_SAVE_FLAG_PAGE          0x008
#define RAM_SAVE_FLAG_EOS           0x010
#define RAM_SAVE_FLAG_CONTINUE      0x020
#define RAM_SAVE_FLAG_XBZRLE        0x040
/* 0x080 is reserved in migration.h; numbering resumes at 0x100 */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
85
86 XBZRLECacheStats xbzrle_counters;
87
88 /* struct contains XBZRLE cache and a static page
89 used by the compression */
90 static struct {
91 /* buffer used for XBZRLE encoding */
92 uint8_t *encoded_buf;
93 /* buffer for storing page content */
94 uint8_t *current_buf;
95 /* Cache for XBZRLE, Protected by lock. */
96 PageCache *cache;
97 QemuMutex lock;
98 /* it will store a page full of zeros */
99 uint8_t *zero_target_page;
100 /* buffer used for XBZRLE decoding */
101 uint8_t *decoded_buf;
102 } XBZRLE;
103
104 static void XBZRLE_cache_lock(void)
105 {
106 if (migrate_use_xbzrle()) {
107 qemu_mutex_lock(&XBZRLE.lock);
108 }
109 }
110
111 static void XBZRLE_cache_unlock(void)
112 {
113 if (migrate_use_xbzrle()) {
114 qemu_mutex_unlock(&XBZRLE.lock);
115 }
116 }
117
118 /**
119 * xbzrle_cache_resize: resize the xbzrle cache
120 *
121 * This function is called from migrate_params_apply in main
122 * thread, possibly while a migration is in progress. A running
123 * migration may be using the cache and might finish during this call,
124 * hence changes to the cache are protected by XBZRLE.lock().
125 *
126 * Returns 0 for success or -1 for error
127 *
128 * @new_size: new cache size
129 * @errp: set *errp if the check failed, with reason
130 */
131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
132 {
133 PageCache *new_cache;
134 int64_t ret = 0;
135
136 /* Check for truncation */
137 if (new_size != (size_t)new_size) {
138 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139 "exceeding address space");
140 return -1;
141 }
142
143 if (new_size == migrate_xbzrle_cache_size()) {
144 /* nothing to do */
145 return 0;
146 }
147
148 XBZRLE_cache_lock();
149
150 if (XBZRLE.cache != NULL) {
151 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152 if (!new_cache) {
153 ret = -1;
154 goto out;
155 }
156
157 cache_fini(XBZRLE.cache);
158 XBZRLE.cache = new_cache;
159 }
160 out:
161 XBZRLE_cache_unlock();
162 return ret;
163 }
164
165 bool ramblock_is_ignored(RAMBlock *block)
166 {
167 return !qemu_ram_is_migratable(block) ||
168 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 }
170
171 #undef RAMBLOCK_FOREACH
172
173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
174 {
175 RAMBlock *block;
176 int ret = 0;
177
178 RCU_READ_LOCK_GUARD();
179
180 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
181 ret = func(block, opaque);
182 if (ret) {
183 break;
184 }
185 }
186 return ret;
187 }
188
189 static void ramblock_recv_map_init(void)
190 {
191 RAMBlock *rb;
192
193 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
194 assert(!rb->receivedmap);
195 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
196 }
197 }
198
199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
200 {
201 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
202 rb->receivedmap);
203 }
204
205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
206 {
207 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 }
209
210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
211 {
212 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 }
214
215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
216 size_t nr)
217 {
218 bitmap_set_atomic(rb->receivedmap,
219 ramblock_recv_bitmap_offset(host_addr, rb),
220 nr);
221 }
222
223 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224
225 /*
226 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
227 *
228 * Returns >0 if success with sent bytes, or <0 if error.
229 */
230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
231 const char *block_name)
232 {
233 RAMBlock *block = qemu_ram_block_by_name(block_name);
234 unsigned long *le_bitmap, nbits;
235 uint64_t size;
236
237 if (!block) {
238 error_report("%s: invalid block name: %s", __func__, block_name);
239 return -1;
240 }
241
242 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243
244 /*
245 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
246 * machines we may need 4 more bytes for padding (see below
247 * comment). So extend it a bit before hand.
248 */
249 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250
251 /*
252 * Always use little endian when sending the bitmap. This is
253 * required that when source and destination VMs are not using the
254 * same endianness. (Note: big endian won't work.)
255 */
256 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
257
258 /* Size of the bitmap, in bytes */
259 size = DIV_ROUND_UP(nbits, 8);
260
261 /*
262 * size is always aligned to 8 bytes for 64bit machines, but it
263 * may not be true for 32bit machines. We need this padding to
264 * make sure the migration can survive even between 32bit and
265 * 64bit machines.
266 */
267 size = ROUND_UP(size, 8);
268
269 qemu_put_be64(file, size);
270 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
271 /*
272 * Mark as an end, in case the middle part is screwed up due to
273 * some "mysterious" reason.
274 */
275 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
276 qemu_fflush(file);
277
278 g_free(le_bitmap);
279
280 if (qemu_file_get_error(file)) {
281 return qemu_file_get_error(file);
282 }
283
284 return size + sizeof(size);
285 }
286
287 /*
288 * An outstanding page request, on the source, having been received
289 * and queued
290 */
291 struct RAMSrcPageRequest {
292 RAMBlock *rb;
293 hwaddr offset;
294 hwaddr len;
295
296 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 };
298
299 typedef struct {
300 /*
301 * Cached ramblock/offset values if preempted. They're only meaningful if
302 * preempted==true below.
303 */
304 RAMBlock *ram_block;
305 unsigned long ram_page;
306 /*
307 * Whether a postcopy preemption just happened. Will be reset after
308 * precopy recovered to background migration.
309 */
310 bool preempted;
311 } PostcopyPreemptState;
312
313 /* State of RAM for migration */
314 struct RAMState {
315 /* QEMUFile used for this migration */
316 QEMUFile *f;
317 /* UFFD file descriptor, used in 'write-tracking' migration */
318 int uffdio_fd;
319 /* Last block that we have visited searching for dirty pages */
320 RAMBlock *last_seen_block;
321 /* Last block from where we have sent data */
322 RAMBlock *last_sent_block;
323 /* Last dirty target page we have sent */
324 ram_addr_t last_page;
325 /* last ram version we have seen */
326 uint32_t last_version;
327 /* How many times we have dirty too many pages */
328 int dirty_rate_high_cnt;
329 /* these variables are used for bitmap sync */
330 /* last time we did a full bitmap_sync */
331 int64_t time_last_bitmap_sync;
332 /* bytes transferred at start_time */
333 uint64_t bytes_xfer_prev;
334 /* number of dirty pages since start_time */
335 uint64_t num_dirty_pages_period;
336 /* xbzrle misses since the beginning of the period */
337 uint64_t xbzrle_cache_miss_prev;
338 /* Amount of xbzrle pages since the beginning of the period */
339 uint64_t xbzrle_pages_prev;
340 /* Amount of xbzrle encoded bytes since the beginning of the period */
341 uint64_t xbzrle_bytes_prev;
342 /* Start using XBZRLE (e.g., after the first round). */
343 bool xbzrle_enabled;
344 /* Are we on the last stage of migration */
345 bool last_stage;
346 /* compression statistics since the beginning of the period */
347 /* amount of count that no free thread to compress data */
348 uint64_t compress_thread_busy_prev;
349 /* amount bytes after compression */
350 uint64_t compressed_size_prev;
351 /* amount of compressed pages */
352 uint64_t compress_pages_prev;
353
354 /* total handled target pages at the beginning of period */
355 uint64_t target_page_count_prev;
356 /* total handled target pages since start */
357 uint64_t target_page_count;
358 /* number of dirty bits in the bitmap */
359 uint64_t migration_dirty_pages;
360 /* Protects modification of the bitmap and migration dirty pages */
361 QemuMutex bitmap_mutex;
362 /* The RAMBlock used in the last src_page_requests */
363 RAMBlock *last_req_rb;
364 /* Queue of outstanding page requests from the destination */
365 QemuMutex src_page_req_mutex;
366 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
367
368 /* Postcopy preemption informations */
369 PostcopyPreemptState postcopy_preempt_state;
370 /*
371 * Current channel we're using on src VM. Only valid if postcopy-preempt
372 * is enabled.
373 */
374 unsigned int postcopy_channel;
375 };
376 typedef struct RAMState RAMState;
377
378 static RAMState *ram_state;
379
380 static NotifierWithReturnList precopy_notifier_list;
381
382 static void postcopy_preempt_reset(RAMState *rs)
383 {
384 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
385 }
386
387 /* Whether postcopy has queued requests? */
388 static bool postcopy_has_request(RAMState *rs)
389 {
390 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
391 }
392
393 void precopy_infrastructure_init(void)
394 {
395 notifier_with_return_list_init(&precopy_notifier_list);
396 }
397
398 void precopy_add_notifier(NotifierWithReturn *n)
399 {
400 notifier_with_return_list_add(&precopy_notifier_list, n);
401 }
402
403 void precopy_remove_notifier(NotifierWithReturn *n)
404 {
405 notifier_with_return_remove(n);
406 }
407
408 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
409 {
410 PrecopyNotifyData pnd;
411 pnd.reason = reason;
412 pnd.errp = errp;
413
414 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
415 }
416
417 uint64_t ram_bytes_remaining(void)
418 {
419 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
420 0;
421 }
422
423 MigrationStats ram_counters;
424
425 void ram_transferred_add(uint64_t bytes)
426 {
427 if (runstate_is_running()) {
428 ram_counters.precopy_bytes += bytes;
429 } else if (migration_in_postcopy()) {
430 ram_counters.postcopy_bytes += bytes;
431 } else {
432 ram_counters.downtime_bytes += bytes;
433 }
434 ram_counters.transferred += bytes;
435 }
436
437 void dirty_sync_missed_zero_copy(void)
438 {
439 ram_counters.dirty_sync_missed_zero_copy++;
440 }
441
442 /* used by the search for pages to send */
443 struct PageSearchStatus {
444 /* Current block being searched */
445 RAMBlock *block;
446 /* Current page to search from */
447 unsigned long page;
448 /* Set once we wrap around */
449 bool complete_round;
450 /*
451 * [POSTCOPY-ONLY] Whether current page is explicitly requested by
452 * postcopy. When set, the request is "urgent" because the dest QEMU
453 * threads are waiting for us.
454 */
455 bool postcopy_requested;
456 /*
457 * [POSTCOPY-ONLY] The target channel to use to send current page.
458 *
459 * Note: This may _not_ match with the value in postcopy_requested
460 * above. Let's imagine the case where the postcopy request is exactly
461 * the page that we're sending in progress during precopy. In this case
462 * we'll have postcopy_requested set to true but the target channel
463 * will be the precopy channel (so that we don't split brain on that
464 * specific page since the precopy channel already contains partial of
465 * that page data).
466 *
467 * Besides that specific use case, postcopy_target_channel should
468 * always be equal to postcopy_requested, because by default we send
469 * postcopy pages via postcopy preempt channel.
470 */
471 bool postcopy_target_channel;
472 };
473 typedef struct PageSearchStatus PageSearchStatus;
474
475 CompressionStats compression_counters;
476
477 struct CompressParam {
478 bool done;
479 bool quit;
480 bool zero_page;
481 QEMUFile *file;
482 QemuMutex mutex;
483 QemuCond cond;
484 RAMBlock *block;
485 ram_addr_t offset;
486
487 /* internally used fields */
488 z_stream stream;
489 uint8_t *originbuf;
490 };
491 typedef struct CompressParam CompressParam;
492
493 struct DecompressParam {
494 bool done;
495 bool quit;
496 QemuMutex mutex;
497 QemuCond cond;
498 void *des;
499 uint8_t *compbuf;
500 int len;
501 z_stream stream;
502 };
503 typedef struct DecompressParam DecompressParam;
504
505 static CompressParam *comp_param;
506 static QemuThread *compress_threads;
507 /* comp_done_cond is used to wake up the migration thread when
508 * one of the compression threads has finished the compression.
509 * comp_done_lock is used to co-work with comp_done_cond.
510 */
511 static QemuMutex comp_done_lock;
512 static QemuCond comp_done_cond;
513
514 static QEMUFile *decomp_file;
515 static DecompressParam *decomp_param;
516 static QemuThread *decompress_threads;
517 static QemuMutex decomp_done_lock;
518 static QemuCond decomp_done_cond;
519
520 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
521 ram_addr_t offset, uint8_t *source_buf);
522
523 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
524 bool postcopy_requested);
525
526 static void *do_data_compress(void *opaque)
527 {
528 CompressParam *param = opaque;
529 RAMBlock *block;
530 ram_addr_t offset;
531 bool zero_page;
532
533 qemu_mutex_lock(&param->mutex);
534 while (!param->quit) {
535 if (param->block) {
536 block = param->block;
537 offset = param->offset;
538 param->block = NULL;
539 qemu_mutex_unlock(&param->mutex);
540
541 zero_page = do_compress_ram_page(param->file, &param->stream,
542 block, offset, param->originbuf);
543
544 qemu_mutex_lock(&comp_done_lock);
545 param->done = true;
546 param->zero_page = zero_page;
547 qemu_cond_signal(&comp_done_cond);
548 qemu_mutex_unlock(&comp_done_lock);
549
550 qemu_mutex_lock(&param->mutex);
551 } else {
552 qemu_cond_wait(&param->cond, &param->mutex);
553 }
554 }
555 qemu_mutex_unlock(&param->mutex);
556
557 return NULL;
558 }
559
560 static void compress_threads_save_cleanup(void)
561 {
562 int i, thread_count;
563
564 if (!migrate_use_compression() || !comp_param) {
565 return;
566 }
567
568 thread_count = migrate_compress_threads();
569 for (i = 0; i < thread_count; i++) {
570 /*
571 * we use it as a indicator which shows if the thread is
572 * properly init'd or not
573 */
574 if (!comp_param[i].file) {
575 break;
576 }
577
578 qemu_mutex_lock(&comp_param[i].mutex);
579 comp_param[i].quit = true;
580 qemu_cond_signal(&comp_param[i].cond);
581 qemu_mutex_unlock(&comp_param[i].mutex);
582
583 qemu_thread_join(compress_threads + i);
584 qemu_mutex_destroy(&comp_param[i].mutex);
585 qemu_cond_destroy(&comp_param[i].cond);
586 deflateEnd(&comp_param[i].stream);
587 g_free(comp_param[i].originbuf);
588 qemu_fclose(comp_param[i].file);
589 comp_param[i].file = NULL;
590 }
591 qemu_mutex_destroy(&comp_done_lock);
592 qemu_cond_destroy(&comp_done_cond);
593 g_free(compress_threads);
594 g_free(comp_param);
595 compress_threads = NULL;
596 comp_param = NULL;
597 }
598
599 static int compress_threads_save_setup(void)
600 {
601 int i, thread_count;
602
603 if (!migrate_use_compression()) {
604 return 0;
605 }
606 thread_count = migrate_compress_threads();
607 compress_threads = g_new0(QemuThread, thread_count);
608 comp_param = g_new0(CompressParam, thread_count);
609 qemu_cond_init(&comp_done_cond);
610 qemu_mutex_init(&comp_done_lock);
611 for (i = 0; i < thread_count; i++) {
612 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
613 if (!comp_param[i].originbuf) {
614 goto exit;
615 }
616
617 if (deflateInit(&comp_param[i].stream,
618 migrate_compress_level()) != Z_OK) {
619 g_free(comp_param[i].originbuf);
620 goto exit;
621 }
622
623 /* comp_param[i].file is just used as a dummy buffer to save data,
624 * set its ops to empty.
625 */
626 comp_param[i].file = qemu_file_new_output(
627 QIO_CHANNEL(qio_channel_null_new()));
628 comp_param[i].done = true;
629 comp_param[i].quit = false;
630 qemu_mutex_init(&comp_param[i].mutex);
631 qemu_cond_init(&comp_param[i].cond);
632 qemu_thread_create(compress_threads + i, "compress",
633 do_data_compress, comp_param + i,
634 QEMU_THREAD_JOINABLE);
635 }
636 return 0;
637
638 exit:
639 compress_threads_save_cleanup();
640 return -1;
641 }
642
643 /**
644 * save_page_header: write page header to wire
645 *
646 * If this is the 1st block, it also writes the block identification
647 *
648 * Returns the number of bytes written
649 *
650 * @f: QEMUFile where to send the data
651 * @block: block that contains the page we want to send
652 * @offset: offset inside the block for the page
653 * in the lower bits, it contains flags
654 */
655 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
656 ram_addr_t offset)
657 {
658 size_t size, len;
659
660 if (block == rs->last_sent_block) {
661 offset |= RAM_SAVE_FLAG_CONTINUE;
662 }
663 qemu_put_be64(f, offset);
664 size = 8;
665
666 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
667 len = strlen(block->idstr);
668 qemu_put_byte(f, len);
669 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
670 size += 1 + len;
671 rs->last_sent_block = block;
672 }
673 return size;
674 }
675
676 /**
677 * mig_throttle_guest_down: throttle down the guest
678 *
679 * Reduce amount of guest cpu execution to hopefully slow down memory
680 * writes. If guest dirty memory rate is reduced below the rate at
681 * which we can transfer pages to the destination then we should be
682 * able to complete migration. Some workloads dirty memory way too
683 * fast and will not effectively converge, even with auto-converge.
684 */
685 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
686 uint64_t bytes_dirty_threshold)
687 {
688 MigrationState *s = migrate_get_current();
689 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
690 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
691 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
692 int pct_max = s->parameters.max_cpu_throttle;
693
694 uint64_t throttle_now = cpu_throttle_get_percentage();
695 uint64_t cpu_now, cpu_ideal, throttle_inc;
696
697 /* We have not started throttling yet. Let's start it. */
698 if (!cpu_throttle_active()) {
699 cpu_throttle_set(pct_initial);
700 } else {
701 /* Throttling already on, just increase the rate */
702 if (!pct_tailslow) {
703 throttle_inc = pct_increment;
704 } else {
705 /* Compute the ideal CPU percentage used by Guest, which may
706 * make the dirty rate match the dirty rate threshold. */
707 cpu_now = 100 - throttle_now;
708 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
709 bytes_dirty_period);
710 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
711 }
712 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
713 }
714 }
715
716 void mig_throttle_counter_reset(void)
717 {
718 RAMState *rs = ram_state;
719
720 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
721 rs->num_dirty_pages_period = 0;
722 rs->bytes_xfer_prev = ram_counters.transferred;
723 }
724
725 /**
726 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
727 *
728 * @rs: current RAM state
729 * @current_addr: address for the zero page
730 *
731 * Update the xbzrle cache to reflect a page that's been sent as all 0.
732 * The important thing is that a stale (not-yet-0'd) page be replaced
733 * by the new data.
734 * As a bonus, if the page wasn't in the cache it gets added so that
735 * when a small write is made into the 0'd page it gets XBZRLE sent.
736 */
737 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
738 {
739 if (!rs->xbzrle_enabled) {
740 return;
741 }
742
743 /* We don't care if this fails to allocate a new cache page
744 * as long as it updated an old one */
745 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
746 ram_counters.dirty_sync_count);
747 }
748
749 #define ENCODING_FLAG_XBZRLE 0x1
750
751 /**
752 * save_xbzrle_page: compress and send current page
753 *
754 * Returns: 1 means that we wrote the page
755 * 0 means that page is identical to the one already sent
756 * -1 means that xbzrle would be longer than normal
757 *
758 * @rs: current RAM state
759 * @current_data: pointer to the address of the page contents
760 * @current_addr: addr of the page
761 * @block: block that contains the page we want to send
762 * @offset: offset inside the block for the page
763 */
764 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
765 ram_addr_t current_addr, RAMBlock *block,
766 ram_addr_t offset)
767 {
768 int encoded_len = 0, bytes_xbzrle;
769 uint8_t *prev_cached_page;
770
771 if (!cache_is_cached(XBZRLE.cache, current_addr,
772 ram_counters.dirty_sync_count)) {
773 xbzrle_counters.cache_miss++;
774 if (!rs->last_stage) {
775 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
776 ram_counters.dirty_sync_count) == -1) {
777 return -1;
778 } else {
779 /* update *current_data when the page has been
780 inserted into cache */
781 *current_data = get_cached_data(XBZRLE.cache, current_addr);
782 }
783 }
784 return -1;
785 }
786
787 /*
788 * Reaching here means the page has hit the xbzrle cache, no matter what
789 * encoding result it is (normal encoding, overflow or skipping the page),
790 * count the page as encoded. This is used to calculate the encoding rate.
791 *
792 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
793 * 2nd page turns out to be skipped (i.e. no new bytes written to the
794 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
795 * skipped page included. In this way, the encoding rate can tell if the
796 * guest page is good for xbzrle encoding.
797 */
798 xbzrle_counters.pages++;
799 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
800
801 /* save current buffer into memory */
802 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
803
804 /* XBZRLE encoding (if there is no overflow) */
805 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
806 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
807 TARGET_PAGE_SIZE);
808
809 /*
810 * Update the cache contents, so that it corresponds to the data
811 * sent, in all cases except where we skip the page.
812 */
813 if (!rs->last_stage && encoded_len != 0) {
814 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
815 /*
816 * In the case where we couldn't compress, ensure that the caller
817 * sends the data from the cache, since the guest might have
818 * changed the RAM since we copied it.
819 */
820 *current_data = prev_cached_page;
821 }
822
823 if (encoded_len == 0) {
824 trace_save_xbzrle_page_skipping();
825 return 0;
826 } else if (encoded_len == -1) {
827 trace_save_xbzrle_page_overflow();
828 xbzrle_counters.overflow++;
829 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
830 return -1;
831 }
832
833 /* Send XBZRLE based compressed page */
834 bytes_xbzrle = save_page_header(rs, rs->f, block,
835 offset | RAM_SAVE_FLAG_XBZRLE);
836 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
837 qemu_put_be16(rs->f, encoded_len);
838 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
839 bytes_xbzrle += encoded_len + 1 + 2;
840 /*
841 * Like compressed_size (please see update_compress_thread_counts),
842 * the xbzrle encoded bytes don't count the 8 byte header with
843 * RAM_SAVE_FLAG_CONTINUE.
844 */
845 xbzrle_counters.bytes += bytes_xbzrle - 8;
846 ram_transferred_add(bytes_xbzrle);
847
848 return 1;
849 }
850
851 /**
852 * migration_bitmap_find_dirty: find the next dirty page from start
853 *
854 * Returns the page offset within memory region of the start of a dirty page
855 *
856 * @rs: current RAM state
857 * @rb: RAMBlock where to search for dirty pages
858 * @start: page where we start the search
859 */
860 static inline
861 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
862 unsigned long start)
863 {
864 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
865 unsigned long *bitmap = rb->bmap;
866
867 if (ramblock_is_ignored(rb)) {
868 return size;
869 }
870
871 return find_next_bit(bitmap, size, start);
872 }
873
874 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
875 unsigned long page)
876 {
877 uint8_t shift;
878 hwaddr size, start;
879
880 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
881 return;
882 }
883
884 shift = rb->clear_bmap_shift;
885 /*
886 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
887 * can make things easier sometimes since then start address
888 * of the small chunk will always be 64 pages aligned so the
889 * bitmap will always be aligned to unsigned long. We should
890 * even be able to remove this restriction but I'm simply
891 * keeping it.
892 */
893 assert(shift >= 6);
894
895 size = 1ULL << (TARGET_PAGE_BITS + shift);
896 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
897 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
898 memory_region_clear_dirty_bitmap(rb->mr, start, size);
899 }
900
901 static void
902 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
903 unsigned long start,
904 unsigned long npages)
905 {
906 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
907 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
908 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
909
910 /*
911 * Clear pages from start to start + npages - 1, so the end boundary is
912 * exclusive.
913 */
914 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
915 migration_clear_memory_region_dirty_bitmap(rb, i);
916 }
917 }
918
919 /*
920 * colo_bitmap_find_diry:find contiguous dirty pages from start
921 *
922 * Returns the page offset within memory region of the start of the contiguout
923 * dirty page
924 *
925 * @rs: current RAM state
926 * @rb: RAMBlock where to search for dirty pages
927 * @start: page where we start the search
928 * @num: the number of contiguous dirty pages
929 */
930 static inline
931 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
932 unsigned long start, unsigned long *num)
933 {
934 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
935 unsigned long *bitmap = rb->bmap;
936 unsigned long first, next;
937
938 *num = 0;
939
940 if (ramblock_is_ignored(rb)) {
941 return size;
942 }
943
944 first = find_next_bit(bitmap, size, start);
945 if (first >= size) {
946 return first;
947 }
948 next = find_next_zero_bit(bitmap, size, first + 1);
949 assert(next >= first);
950 *num = next - first;
951 return first;
952 }
953
954 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
955 RAMBlock *rb,
956 unsigned long page)
957 {
958 bool ret;
959
960 /*
961 * Clear dirty bitmap if needed. This _must_ be called before we
962 * send any of the page in the chunk because we need to make sure
963 * we can capture further page content changes when we sync dirty
964 * log the next time. So as long as we are going to send any of
965 * the page in the chunk we clear the remote dirty bitmap for all.
966 * Clearing it earlier won't be a problem, but too late will.
967 */
968 migration_clear_memory_region_dirty_bitmap(rb, page);
969
970 ret = test_and_clear_bit(page, rb->bmap);
971 if (ret) {
972 rs->migration_dirty_pages--;
973 }
974
975 return ret;
976 }
977
978 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
979 void *opaque)
980 {
981 const hwaddr offset = section->offset_within_region;
982 const hwaddr size = int128_get64(section->size);
983 const unsigned long start = offset >> TARGET_PAGE_BITS;
984 const unsigned long npages = size >> TARGET_PAGE_BITS;
985 RAMBlock *rb = section->mr->ram_block;
986 uint64_t *cleared_bits = opaque;
987
988 /*
989 * We don't grab ram_state->bitmap_mutex because we expect to run
990 * only when starting migration or during postcopy recovery where
991 * we don't have concurrent access.
992 */
993 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
994 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
995 }
996 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
997 bitmap_clear(rb->bmap, start, npages);
998 }
999
1000 /*
1001 * Exclude all dirty pages from migration that fall into a discarded range as
1002 * managed by a RamDiscardManager responsible for the mapped memory region of
1003 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1004 *
1005 * Discarded pages ("logically unplugged") have undefined content and must
1006 * not get migrated, because even reading these pages for migration might
1007 * result in undesired behavior.
1008 *
1009 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1010 *
1011 * Note: The result is only stable while migrating (precopy/postcopy).
1012 */
1013 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1014 {
1015 uint64_t cleared_bits = 0;
1016
1017 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1018 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1019 MemoryRegionSection section = {
1020 .mr = rb->mr,
1021 .offset_within_region = 0,
1022 .size = int128_make64(qemu_ram_get_used_length(rb)),
1023 };
1024
1025 ram_discard_manager_replay_discarded(rdm, &section,
1026 dirty_bitmap_clear_section,
1027 &cleared_bits);
1028 }
1029 return cleared_bits;
1030 }
1031
1032 /*
1033 * Check if a host-page aligned page falls into a discarded range as managed by
1034 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1035 *
1036 * Note: The result is only stable while migrating (precopy/postcopy).
1037 */
1038 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1039 {
1040 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1041 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1042 MemoryRegionSection section = {
1043 .mr = rb->mr,
1044 .offset_within_region = start,
1045 .size = int128_make64(qemu_ram_pagesize(rb)),
1046 };
1047
1048 return !ram_discard_manager_is_populated(rdm, &section);
1049 }
1050 return false;
1051 }
1052
1053 /* Called with RCU critical section */
1054 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1055 {
1056 uint64_t new_dirty_pages =
1057 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1058
1059 rs->migration_dirty_pages += new_dirty_pages;
1060 rs->num_dirty_pages_period += new_dirty_pages;
1061 }
1062
1063 /**
1064 * ram_pagesize_summary: calculate all the pagesizes of a VM
1065 *
1066 * Returns a summary bitmap of the page sizes of all RAMBlocks
1067 *
1068 * For VMs with just normal pages this is equivalent to the host page
1069 * size. If it's got some huge pages then it's the OR of all the
1070 * different page sizes.
1071 */
1072 uint64_t ram_pagesize_summary(void)
1073 {
1074 RAMBlock *block;
1075 uint64_t summary = 0;
1076
1077 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1078 summary |= block->page_size;
1079 }
1080
1081 return summary;
1082 }
1083
1084 uint64_t ram_get_total_transferred_pages(void)
1085 {
1086 return ram_counters.normal + ram_counters.duplicate +
1087 compression_counters.pages + xbzrle_counters.pages;
1088 }
1089
1090 static void migration_update_rates(RAMState *rs, int64_t end_time)
1091 {
1092 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1093 double compressed_size;
1094
1095 /* calculate period counters */
1096 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1097 / (end_time - rs->time_last_bitmap_sync);
1098
1099 if (!page_count) {
1100 return;
1101 }
1102
1103 if (migrate_use_xbzrle()) {
1104 double encoded_size, unencoded_size;
1105
1106 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1107 rs->xbzrle_cache_miss_prev) / page_count;
1108 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1109 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1110 TARGET_PAGE_SIZE;
1111 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1112 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1113 xbzrle_counters.encoding_rate = 0;
1114 } else {
1115 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1116 }
1117 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1118 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1119 }
1120
1121 if (migrate_use_compression()) {
1122 compression_counters.busy_rate = (double)(compression_counters.busy -
1123 rs->compress_thread_busy_prev) / page_count;
1124 rs->compress_thread_busy_prev = compression_counters.busy;
1125
1126 compressed_size = compression_counters.compressed_size -
1127 rs->compressed_size_prev;
1128 if (compressed_size) {
1129 double uncompressed_size = (compression_counters.pages -
1130 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1131
1132 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1133 compression_counters.compression_rate =
1134 uncompressed_size / compressed_size;
1135
1136 rs->compress_pages_prev = compression_counters.pages;
1137 rs->compressed_size_prev = compression_counters.compressed_size;
1138 }
1139 }
1140 }
1141
1142 static void migration_trigger_throttle(RAMState *rs)
1143 {
1144 MigrationState *s = migrate_get_current();
1145 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1146
1147 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1148 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1149 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1150
1151 /* During block migration the auto-converge logic incorrectly detects
1152 * that ram migration makes no progress. Avoid this by disabling the
1153 * throttling logic during the bulk phase of block migration. */
1154 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1155 /* The following detection logic can be refined later. For now:
1156 Check to see if the ratio between dirtied bytes and the approx.
1157 amount of bytes that just got transferred since the last time
1158 we were in this routine reaches the threshold. If that happens
1159 twice, start or increase throttling. */
1160
1161 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1162 (++rs->dirty_rate_high_cnt >= 2)) {
1163 trace_migration_throttle();
1164 rs->dirty_rate_high_cnt = 0;
1165 mig_throttle_guest_down(bytes_dirty_period,
1166 bytes_dirty_threshold);
1167 }
1168 }
1169 }
1170
1171 static void migration_bitmap_sync(RAMState *rs)
1172 {
1173 RAMBlock *block;
1174 int64_t end_time;
1175
1176 ram_counters.dirty_sync_count++;
1177
1178 if (!rs->time_last_bitmap_sync) {
1179 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1180 }
1181
1182 trace_migration_bitmap_sync_start();
1183 memory_global_dirty_log_sync();
1184
1185 qemu_mutex_lock(&rs->bitmap_mutex);
1186 WITH_RCU_READ_LOCK_GUARD() {
1187 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1188 ramblock_sync_dirty_bitmap(rs, block);
1189 }
1190 ram_counters.remaining = ram_bytes_remaining();
1191 }
1192 qemu_mutex_unlock(&rs->bitmap_mutex);
1193
1194 memory_global_after_dirty_log_sync();
1195 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1196
1197 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1198
1199 /* more than 1 second = 1000 millisecons */
1200 if (end_time > rs->time_last_bitmap_sync + 1000) {
1201 migration_trigger_throttle(rs);
1202
1203 migration_update_rates(rs, end_time);
1204
1205 rs->target_page_count_prev = rs->target_page_count;
1206
1207 /* reset period counters */
1208 rs->time_last_bitmap_sync = end_time;
1209 rs->num_dirty_pages_period = 0;
1210 rs->bytes_xfer_prev = ram_counters.transferred;
1211 }
1212 if (migrate_use_events()) {
1213 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1214 }
1215 }
1216
1217 static void migration_bitmap_sync_precopy(RAMState *rs)
1218 {
1219 Error *local_err = NULL;
1220
1221 /*
1222 * The current notifier usage is just an optimization to migration, so we
1223 * don't stop the normal migration process in the error case.
1224 */
1225 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1226 error_report_err(local_err);
1227 local_err = NULL;
1228 }
1229
1230 migration_bitmap_sync(rs);
1231
1232 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1233 error_report_err(local_err);
1234 }
1235 }
1236
1237 void ram_release_page(const char *rbname, uint64_t offset)
1238 {
1239 if (!migrate_release_ram() || !migration_in_postcopy()) {
1240 return;
1241 }
1242
1243 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1244 }
1245
1246 /**
1247 * save_zero_page_to_file: send the zero page to the file
1248 *
1249 * Returns the size of data written to the file, 0 means the page is not
1250 * a zero page
1251 *
1252 * @rs: current RAM state
1253 * @file: the file where the data is saved
1254 * @block: block that contains the page we want to send
1255 * @offset: offset inside the block for the page
1256 */
1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1258 RAMBlock *block, ram_addr_t offset)
1259 {
1260 uint8_t *p = block->host + offset;
1261 int len = 0;
1262
1263 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1264 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1265 qemu_put_byte(file, 0);
1266 len += 1;
1267 ram_release_page(block->idstr, offset);
1268 }
1269 return len;
1270 }
1271
1272 /**
1273 * save_zero_page: send the zero page to the stream
1274 *
1275 * Returns the number of pages written.
1276 *
1277 * @rs: current RAM state
1278 * @block: block that contains the page we want to send
1279 * @offset: offset inside the block for the page
1280 */
1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1282 {
1283 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1284
1285 if (len) {
1286 ram_counters.duplicate++;
1287 ram_transferred_add(len);
1288 return 1;
1289 }
1290 return -1;
1291 }
1292
1293 /*
1294 * @pages: the number of pages written by the control path,
1295 * < 0 - error
1296 * > 0 - number of pages written
1297 *
1298 * Return true if the pages has been saved, otherwise false is returned.
1299 */
1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301 int *pages)
1302 {
1303 uint64_t bytes_xmit = 0;
1304 int ret;
1305
1306 *pages = -1;
1307 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1308 &bytes_xmit);
1309 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1310 return false;
1311 }
1312
1313 if (bytes_xmit) {
1314 ram_transferred_add(bytes_xmit);
1315 *pages = 1;
1316 }
1317
1318 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1319 return true;
1320 }
1321
1322 if (bytes_xmit > 0) {
1323 ram_counters.normal++;
1324 } else if (bytes_xmit == 0) {
1325 ram_counters.duplicate++;
1326 }
1327
1328 return true;
1329 }
1330
1331 /*
1332 * directly send the page to the stream
1333 *
1334 * Returns the number of pages written.
1335 *
1336 * @rs: current RAM state
1337 * @block: block that contains the page we want to send
1338 * @offset: offset inside the block for the page
1339 * @buf: the page to be sent
1340 * @async: send to page asyncly
1341 */
1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1343 uint8_t *buf, bool async)
1344 {
1345 ram_transferred_add(save_page_header(rs, rs->f, block,
1346 offset | RAM_SAVE_FLAG_PAGE));
1347 if (async) {
1348 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1349 migrate_release_ram() &&
1350 migration_in_postcopy());
1351 } else {
1352 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1353 }
1354 ram_transferred_add(TARGET_PAGE_SIZE);
1355 ram_counters.normal++;
1356 return 1;
1357 }
1358
1359 /**
1360 * ram_save_page: send the given page to the stream
1361 *
1362 * Returns the number of pages written.
1363 * < 0 - error
1364 * >=0 - Number of pages written - this might legally be 0
1365 * if xbzrle noticed the page was the same.
1366 *
1367 * @rs: current RAM state
1368 * @block: block that contains the page we want to send
1369 * @offset: offset inside the block for the page
1370 */
1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1372 {
1373 int pages = -1;
1374 uint8_t *p;
1375 bool send_async = true;
1376 RAMBlock *block = pss->block;
1377 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1378 ram_addr_t current_addr = block->offset + offset;
1379
1380 p = block->host + offset;
1381 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1382
1383 XBZRLE_cache_lock();
1384 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1385 pages = save_xbzrle_page(rs, &p, current_addr, block,
1386 offset);
1387 if (!rs->last_stage) {
1388 /* Can't send this cached data async, since the cache page
1389 * might get updated before it gets to the wire
1390 */
1391 send_async = false;
1392 }
1393 }
1394
1395 /* XBZRLE overflow or normal page */
1396 if (pages == -1) {
1397 pages = save_normal_page(rs, block, offset, p, send_async);
1398 }
1399
1400 XBZRLE_cache_unlock();
1401
1402 return pages;
1403 }
1404
1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1406 ram_addr_t offset)
1407 {
1408 if (multifd_queue_page(rs->f, block, offset) < 0) {
1409 return -1;
1410 }
1411 ram_counters.normal++;
1412
1413 return 1;
1414 }
1415
1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1417 ram_addr_t offset, uint8_t *source_buf)
1418 {
1419 RAMState *rs = ram_state;
1420 uint8_t *p = block->host + offset;
1421 int ret;
1422
1423 if (save_zero_page_to_file(rs, f, block, offset)) {
1424 return true;
1425 }
1426
1427 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1428
1429 /*
1430 * copy it to a internal buffer to avoid it being modified by VM
1431 * so that we can catch up the error during compression and
1432 * decompression
1433 */
1434 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1435 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1436 if (ret < 0) {
1437 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1438 error_report("compressed data failed!");
1439 }
1440 return false;
1441 }
1442
1443 static void
1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1445 {
1446 ram_transferred_add(bytes_xmit);
1447
1448 if (param->zero_page) {
1449 ram_counters.duplicate++;
1450 return;
1451 }
1452
1453 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1454 compression_counters.compressed_size += bytes_xmit - 8;
1455 compression_counters.pages++;
1456 }
1457
1458 static bool save_page_use_compression(RAMState *rs);
1459
1460 static void flush_compressed_data(RAMState *rs)
1461 {
1462 int idx, len, thread_count;
1463
1464 if (!save_page_use_compression(rs)) {
1465 return;
1466 }
1467 thread_count = migrate_compress_threads();
1468
1469 qemu_mutex_lock(&comp_done_lock);
1470 for (idx = 0; idx < thread_count; idx++) {
1471 while (!comp_param[idx].done) {
1472 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473 }
1474 }
1475 qemu_mutex_unlock(&comp_done_lock);
1476
1477 for (idx = 0; idx < thread_count; idx++) {
1478 qemu_mutex_lock(&comp_param[idx].mutex);
1479 if (!comp_param[idx].quit) {
1480 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1481 /*
1482 * it's safe to fetch zero_page without holding comp_done_lock
1483 * as there is no further request submitted to the thread,
1484 * i.e, the thread should be waiting for a request at this point.
1485 */
1486 update_compress_thread_counts(&comp_param[idx], len);
1487 }
1488 qemu_mutex_unlock(&comp_param[idx].mutex);
1489 }
1490 }
1491
1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1493 ram_addr_t offset)
1494 {
1495 param->block = block;
1496 param->offset = offset;
1497 }
1498
1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1500 ram_addr_t offset)
1501 {
1502 int idx, thread_count, bytes_xmit = -1, pages = -1;
1503 bool wait = migrate_compress_wait_thread();
1504
1505 thread_count = migrate_compress_threads();
1506 qemu_mutex_lock(&comp_done_lock);
1507 retry:
1508 for (idx = 0; idx < thread_count; idx++) {
1509 if (comp_param[idx].done) {
1510 comp_param[idx].done = false;
1511 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1512 qemu_mutex_lock(&comp_param[idx].mutex);
1513 set_compress_params(&comp_param[idx], block, offset);
1514 qemu_cond_signal(&comp_param[idx].cond);
1515 qemu_mutex_unlock(&comp_param[idx].mutex);
1516 pages = 1;
1517 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1518 break;
1519 }
1520 }
1521
1522 /*
1523 * wait for the free thread if the user specifies 'compress-wait-thread',
1524 * otherwise we will post the page out in the main thread as normal page.
1525 */
1526 if (pages < 0 && wait) {
1527 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1528 goto retry;
1529 }
1530 qemu_mutex_unlock(&comp_done_lock);
1531
1532 return pages;
1533 }
1534
1535 /**
1536 * find_dirty_block: find the next dirty page and update any state
1537 * associated with the search process.
1538 *
1539 * Returns true if a page is found
1540 *
1541 * @rs: current RAM state
1542 * @pss: data about the state of the current dirty page scan
1543 * @again: set to false if the search has scanned the whole of RAM
1544 */
1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1546 {
1547 /*
1548 * This is not a postcopy requested page, mark it "not urgent", and use
1549 * precopy channel to send it.
1550 */
1551 pss->postcopy_requested = false;
1552 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1553
1554 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1555 if (pss->complete_round && pss->block == rs->last_seen_block &&
1556 pss->page >= rs->last_page) {
1557 /*
1558 * We've been once around the RAM and haven't found anything.
1559 * Give up.
1560 */
1561 *again = false;
1562 return false;
1563 }
1564 if (!offset_in_ramblock(pss->block,
1565 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1566 /* Didn't find anything in this RAM Block */
1567 pss->page = 0;
1568 pss->block = QLIST_NEXT_RCU(pss->block, next);
1569 if (!pss->block) {
1570 /*
1571 * If memory migration starts over, we will meet a dirtied page
1572 * which may still exists in compression threads's ring, so we
1573 * should flush the compressed data to make sure the new page
1574 * is not overwritten by the old one in the destination.
1575 *
1576 * Also If xbzrle is on, stop using the data compression at this
1577 * point. In theory, xbzrle can do better than compression.
1578 */
1579 flush_compressed_data(rs);
1580
1581 /* Hit the end of the list */
1582 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1583 /* Flag that we've looped */
1584 pss->complete_round = true;
1585 /* After the first round, enable XBZRLE. */
1586 if (migrate_use_xbzrle()) {
1587 rs->xbzrle_enabled = true;
1588 }
1589 }
1590 /* Didn't find anything this time, but try again on the new block */
1591 *again = true;
1592 return false;
1593 } else {
1594 /* Can go around again, but... */
1595 *again = true;
1596 /* We've found something so probably don't need to */
1597 return true;
1598 }
1599 }
1600
1601 /**
1602 * unqueue_page: gets a page of the queue
1603 *
1604 * Helper for 'get_queued_page' - gets a page off the queue
1605 *
1606 * Returns the block of the page (or NULL if none available)
1607 *
1608 * @rs: current RAM state
1609 * @offset: used to return the offset within the RAMBlock
1610 */
1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1612 {
1613 struct RAMSrcPageRequest *entry;
1614 RAMBlock *block = NULL;
1615
1616 if (!postcopy_has_request(rs)) {
1617 return NULL;
1618 }
1619
1620 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1621
1622 /*
1623 * This should _never_ change even after we take the lock, because no one
1624 * should be taking anything off the request list other than us.
1625 */
1626 assert(postcopy_has_request(rs));
1627
1628 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1629 block = entry->rb;
1630 *offset = entry->offset;
1631
1632 if (entry->len > TARGET_PAGE_SIZE) {
1633 entry->len -= TARGET_PAGE_SIZE;
1634 entry->offset += TARGET_PAGE_SIZE;
1635 } else {
1636 memory_region_unref(block->mr);
1637 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1638 g_free(entry);
1639 migration_consume_urgent_request();
1640 }
1641
1642 return block;
1643 }
1644
1645 #if defined(__linux__)
1646 /**
1647 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1648 * is found, return RAM block pointer and page offset
1649 *
1650 * Returns pointer to the RAMBlock containing faulting page,
1651 * NULL if no write faults are pending
1652 *
1653 * @rs: current RAM state
1654 * @offset: page offset from the beginning of the block
1655 */
1656 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1657 {
1658 struct uffd_msg uffd_msg;
1659 void *page_address;
1660 RAMBlock *block;
1661 int res;
1662
1663 if (!migrate_background_snapshot()) {
1664 return NULL;
1665 }
1666
1667 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1668 if (res <= 0) {
1669 return NULL;
1670 }
1671
1672 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1673 block = qemu_ram_block_from_host(page_address, false, offset);
1674 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1675 return block;
1676 }
1677
1678 /**
1679 * ram_save_release_protection: release UFFD write protection after
1680 * a range of pages has been saved
1681 *
1682 * @rs: current RAM state
1683 * @pss: page-search-status structure
1684 * @start_page: index of the first page in the range relative to pss->block
1685 *
1686 * Returns 0 on success, negative value in case of an error
1687 */
1688 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1689 unsigned long start_page)
1690 {
1691 int res = 0;
1692
1693 /* Check if page is from UFFD-managed region. */
1694 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1695 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1696 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1697
1698 /* Flush async buffers before un-protect. */
1699 qemu_fflush(rs->f);
1700 /* Un-protect memory range. */
1701 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1702 false, false);
1703 }
1704
1705 return res;
1706 }
1707
1708 /* ram_write_tracking_available: check if kernel supports required UFFD features
1709 *
1710 * Returns true if supports, false otherwise
1711 */
1712 bool ram_write_tracking_available(void)
1713 {
1714 uint64_t uffd_features;
1715 int res;
1716
1717 res = uffd_query_features(&uffd_features);
1718 return (res == 0 &&
1719 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1720 }
1721
1722 /* ram_write_tracking_compatible: check if guest configuration is
1723 * compatible with 'write-tracking'
1724 *
1725 * Returns true if compatible, false otherwise
1726 */
1727 bool ram_write_tracking_compatible(void)
1728 {
1729 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1730 int uffd_fd;
1731 RAMBlock *block;
1732 bool ret = false;
1733
1734 /* Open UFFD file descriptor */
1735 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1736 if (uffd_fd < 0) {
1737 return false;
1738 }
1739
1740 RCU_READ_LOCK_GUARD();
1741
1742 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1743 uint64_t uffd_ioctls;
1744
1745 /* Nothing to do with read-only and MMIO-writable regions */
1746 if (block->mr->readonly || block->mr->rom_device) {
1747 continue;
1748 }
1749 /* Try to register block memory via UFFD-IO to track writes */
1750 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1751 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1752 goto out;
1753 }
1754 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1755 goto out;
1756 }
1757 }
1758 ret = true;
1759
1760 out:
1761 uffd_close_fd(uffd_fd);
1762 return ret;
1763 }
1764
1765 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1766 ram_addr_t size)
1767 {
1768 /*
1769 * We read one byte of each page; this will preallocate page tables if
1770 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1771 * where no page was populated yet. This might require adaption when
1772 * supporting other mappings, like shmem.
1773 */
1774 for (; offset < size; offset += block->page_size) {
1775 char tmp = *((char *)block->host + offset);
1776
1777 /* Don't optimize the read out */
1778 asm volatile("" : "+r" (tmp));
1779 }
1780 }
1781
1782 static inline int populate_read_section(MemoryRegionSection *section,
1783 void *opaque)
1784 {
1785 const hwaddr size = int128_get64(section->size);
1786 hwaddr offset = section->offset_within_region;
1787 RAMBlock *block = section->mr->ram_block;
1788
1789 populate_read_range(block, offset, size);
1790 return 0;
1791 }
1792
1793 /*
1794 * ram_block_populate_read: preallocate page tables and populate pages in the
1795 * RAM block by reading a byte of each page.
1796 *
1797 * Since it's solely used for userfault_fd WP feature, here we just
1798 * hardcode page size to qemu_real_host_page_size.
1799 *
1800 * @block: RAM block to populate
1801 */
1802 static void ram_block_populate_read(RAMBlock *rb)
1803 {
1804 /*
1805 * Skip populating all pages that fall into a discarded range as managed by
1806 * a RamDiscardManager responsible for the mapped memory region of the
1807 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1808 * must not get populated automatically. We don't have to track
1809 * modifications via userfaultfd WP reliably, because these pages will
1810 * not be part of the migration stream either way -- see
1811 * ramblock_dirty_bitmap_exclude_discarded_pages().
1812 *
1813 * Note: The result is only stable while migrating (precopy/postcopy).
1814 */
1815 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1816 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1817 MemoryRegionSection section = {
1818 .mr = rb->mr,
1819 .offset_within_region = 0,
1820 .size = rb->mr->size,
1821 };
1822
1823 ram_discard_manager_replay_populated(rdm, &section,
1824 populate_read_section, NULL);
1825 } else {
1826 populate_read_range(rb, 0, rb->used_length);
1827 }
1828 }
1829
1830 /*
1831 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1832 */
1833 void ram_write_tracking_prepare(void)
1834 {
1835 RAMBlock *block;
1836
1837 RCU_READ_LOCK_GUARD();
1838
1839 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1840 /* Nothing to do with read-only and MMIO-writable regions */
1841 if (block->mr->readonly || block->mr->rom_device) {
1842 continue;
1843 }
1844
1845 /*
1846 * Populate pages of the RAM block before enabling userfault_fd
1847 * write protection.
1848 *
1849 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1850 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1851 * pages with pte_none() entries in page table.
1852 */
1853 ram_block_populate_read(block);
1854 }
1855 }
1856
1857 /*
1858 * ram_write_tracking_start: start UFFD-WP memory tracking
1859 *
1860 * Returns 0 for success or negative value in case of error
1861 */
1862 int ram_write_tracking_start(void)
1863 {
1864 int uffd_fd;
1865 RAMState *rs = ram_state;
1866 RAMBlock *block;
1867
1868 /* Open UFFD file descriptor */
1869 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1870 if (uffd_fd < 0) {
1871 return uffd_fd;
1872 }
1873 rs->uffdio_fd = uffd_fd;
1874
1875 RCU_READ_LOCK_GUARD();
1876
1877 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1878 /* Nothing to do with read-only and MMIO-writable regions */
1879 if (block->mr->readonly || block->mr->rom_device) {
1880 continue;
1881 }
1882
1883 /* Register block memory with UFFD to track writes */
1884 if (uffd_register_memory(rs->uffdio_fd, block->host,
1885 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1886 goto fail;
1887 }
1888 /* Apply UFFD write protection to the block memory range */
1889 if (uffd_change_protection(rs->uffdio_fd, block->host,
1890 block->max_length, true, false)) {
1891 goto fail;
1892 }
1893 block->flags |= RAM_UF_WRITEPROTECT;
1894 memory_region_ref(block->mr);
1895
1896 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1897 block->host, block->max_length);
1898 }
1899
1900 return 0;
1901
1902 fail:
1903 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1904
1905 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1906 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1907 continue;
1908 }
1909 /*
1910 * In case some memory block failed to be write-protected
1911 * remove protection and unregister all succeeded RAM blocks
1912 */
1913 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1914 false, false);
1915 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1916 /* Cleanup flags and remove reference */
1917 block->flags &= ~RAM_UF_WRITEPROTECT;
1918 memory_region_unref(block->mr);
1919 }
1920
1921 uffd_close_fd(uffd_fd);
1922 rs->uffdio_fd = -1;
1923 return -1;
1924 }
1925
1926 /**
1927 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1928 */
1929 void ram_write_tracking_stop(void)
1930 {
1931 RAMState *rs = ram_state;
1932 RAMBlock *block;
1933
1934 RCU_READ_LOCK_GUARD();
1935
1936 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1937 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1938 continue;
1939 }
1940 /* Remove protection and unregister all affected RAM blocks */
1941 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1942 false, false);
1943 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1944
1945 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1946 block->host, block->max_length);
1947
1948 /* Cleanup flags and remove reference */
1949 block->flags &= ~RAM_UF_WRITEPROTECT;
1950 memory_region_unref(block->mr);
1951 }
1952
1953 /* Finally close UFFD file descriptor */
1954 uffd_close_fd(rs->uffdio_fd);
1955 rs->uffdio_fd = -1;
1956 }
1957
1958 #else
1959 /* No target OS support, stubs just fail or ignore */
1960
1961 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1962 {
1963 (void) rs;
1964 (void) offset;
1965
1966 return NULL;
1967 }
1968
1969 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1970 unsigned long start_page)
1971 {
1972 (void) rs;
1973 (void) pss;
1974 (void) start_page;
1975
1976 return 0;
1977 }
1978
/* Stub for hosts other than Linux: write tracking is never available. */
bool ram_write_tracking_available(void)
{
    return false;
}
1983
/*
 * Stub for hosts other than Linux: asserts if called, since
 * ram_write_tracking_available() returns false here.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1989
/*
 * Stub for hosts other than Linux: asserts if called; returns -1 when
 * assertions are compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1995
/* Stub for hosts other than Linux: asserts if called. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
2000 #endif /* defined(__linux__) */
2001
2002 /*
2003 * Check whether two addr/offset of the ramblock falls onto the same host huge
2004 * page. Returns true if so, false otherwise.
2005 */
2006 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2007 uint64_t addr2)
2008 {
2009 size_t page_size = qemu_ram_pagesize(rb);
2010
2011 addr1 = ROUND_DOWN(addr1, page_size);
2012 addr2 = ROUND_DOWN(addr2, page_size);
2013
2014 return addr1 == addr2;
2015 }
2016
2017 /*
2018 * Whether a previous preempted precopy huge page contains current requested
2019 * page? Returns true if so, false otherwise.
2020 *
2021 * This should really happen very rarely, because it means when we were sending
2022 * during background migration for postcopy we're sending exactly the page that
2023 * some vcpu got faulted on on dest node. When it happens, we probably don't
2024 * need to do much but drop the request, because we know right after we restore
2025 * the precopy stream it'll be serviced. It'll slightly affect the order of
2026 * postcopy requests to be serviced (e.g. it'll be the same as we move current
2027 * request to the end of the queue) but it shouldn't be a big deal. The most
2028 * imporant thing is we can _never_ try to send a partial-sent huge page on the
2029 * POSTCOPY channel again, otherwise that huge page will got "split brain" on
2030 * two channels (PRECOPY, POSTCOPY).
2031 */
2032 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2033 ram_addr_t offset)
2034 {
2035 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2036
2037 /* No preemption at all? */
2038 if (!state->preempted) {
2039 return false;
2040 }
2041
2042 /* Not even the same ramblock? */
2043 if (state->ram_block != block) {
2044 return false;
2045 }
2046
2047 return offset_on_same_huge_page(block, offset,
2048 state->ram_page << TARGET_PAGE_BITS);
2049 }
2050
2051 /**
2052 * get_queued_page: unqueue a page from the postcopy requests
2053 *
2054 * Skips pages that are already sent (!dirty)
2055 *
2056 * Returns true if a queued page is found
2057 *
2058 * @rs: current RAM state
2059 * @pss: data about the state of the current dirty page scan
2060 */
2061 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2062 {
2063 RAMBlock *block;
2064 ram_addr_t offset;
2065 bool dirty;
2066
2067 do {
2068 block = unqueue_page(rs, &offset);
2069 /*
2070 * We're sending this page, and since it's postcopy nothing else
2071 * will dirty it, and we must make sure it doesn't get sent again
2072 * even if this queue request was received after the background
2073 * search already sent it.
2074 */
2075 if (block) {
2076 unsigned long page;
2077
2078 page = offset >> TARGET_PAGE_BITS;
2079 dirty = test_bit(page, block->bmap);
2080 if (!dirty) {
2081 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2082 page);
2083 } else {
2084 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2085 }
2086 }
2087
2088 } while (block && !dirty);
2089
2090 if (block) {
2091 /* See comment above postcopy_preempted_contains() */
2092 if (postcopy_preempted_contains(rs, block, offset)) {
2093 trace_postcopy_preempt_hit(block->idstr, offset);
2094 /*
2095 * If what we preempted previously was exactly what we're
2096 * requesting right now, restore the preempted precopy
2097 * immediately, boosting its priority as it's requested by
2098 * postcopy.
2099 */
2100 postcopy_preempt_restore(rs, pss, true);
2101 return true;
2102 }
2103 } else {
2104 /*
2105 * Poll write faults too if background snapshot is enabled; that's
2106 * when we have vcpus got blocked by the write protected pages.
2107 */
2108 block = poll_fault_page(rs, &offset);
2109 }
2110
2111 if (block) {
2112 /*
2113 * We want the background search to continue from the queued page
2114 * since the guest is likely to want other pages near to the page
2115 * it just requested.
2116 */
2117 pss->block = block;
2118 pss->page = offset >> TARGET_PAGE_BITS;
2119
2120 /*
2121 * This unqueued page would break the "one round" check, even is
2122 * really rare.
2123 */
2124 pss->complete_round = false;
2125 /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2126 pss->postcopy_requested = true;
2127 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2128 }
2129
2130 return !!block;
2131 }
2132
2133 /**
2134 * migration_page_queue_free: drop any remaining pages in the ram
2135 * request queue
2136 *
2137 * It should be empty at the end anyway, but in error cases there may
2138 * be some left. in case that there is any page left, we drop it.
2139 *
2140 */
2141 static void migration_page_queue_free(RAMState *rs)
2142 {
2143 struct RAMSrcPageRequest *mspr, *next_mspr;
2144 /* This queue generally should be empty - but in the case of a failed
2145 * migration might have some droppings in.
2146 */
2147 RCU_READ_LOCK_GUARD();
2148 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2149 memory_region_unref(mspr->rb->mr);
2150 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2151 g_free(mspr);
2152 }
2153 }
2154
2155 /**
2156 * ram_save_queue_pages: queue the page for transmission
2157 *
2158 * A request from postcopy destination for example.
2159 *
2160 * Returns zero on success or negative on error
2161 *
2162 * @rbname: Name of the RAMBLock of the request. NULL means the
2163 * same that last one.
2164 * @start: starting address from the start of the RAMBlock
2165 * @len: length (in bytes) to send
2166 */
2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2168 {
2169 RAMBlock *ramblock;
2170 RAMState *rs = ram_state;
2171
2172 ram_counters.postcopy_requests++;
2173 RCU_READ_LOCK_GUARD();
2174
2175 if (!rbname) {
2176 /* Reuse last RAMBlock */
2177 ramblock = rs->last_req_rb;
2178
2179 if (!ramblock) {
2180 /*
2181 * Shouldn't happen, we can't reuse the last RAMBlock if
2182 * it's the 1st request.
2183 */
2184 error_report("ram_save_queue_pages no previous block");
2185 return -1;
2186 }
2187 } else {
2188 ramblock = qemu_ram_block_by_name(rbname);
2189
2190 if (!ramblock) {
2191 /* We shouldn't be asked for a non-existent RAMBlock */
2192 error_report("ram_save_queue_pages no block '%s'", rbname);
2193 return -1;
2194 }
2195 rs->last_req_rb = ramblock;
2196 }
2197 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2198 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2199 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2200 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2201 __func__, start, len, ramblock->used_length);
2202 return -1;
2203 }
2204
2205 struct RAMSrcPageRequest *new_entry =
2206 g_new0(struct RAMSrcPageRequest, 1);
2207 new_entry->rb = ramblock;
2208 new_entry->offset = start;
2209 new_entry->len = len;
2210
2211 memory_region_ref(ramblock->mr);
2212 qemu_mutex_lock(&rs->src_page_req_mutex);
2213 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2214 migration_make_urgent_request();
2215 qemu_mutex_unlock(&rs->src_page_req_mutex);
2216
2217 return 0;
2218 }
2219
2220 static bool save_page_use_compression(RAMState *rs)
2221 {
2222 if (!migrate_use_compression()) {
2223 return false;
2224 }
2225
2226 /*
2227 * If xbzrle is enabled (e.g., after first round of migration), stop
2228 * using the data compression. In theory, xbzrle can do better than
2229 * compression.
2230 */
2231 if (rs->xbzrle_enabled) {
2232 return false;
2233 }
2234
2235 return true;
2236 }
2237
2238 /*
2239 * try to compress the page before posting it out, return true if the page
2240 * has been properly handled by compression, otherwise needs other
2241 * paths to handle it
2242 */
2243 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2244 {
2245 if (!save_page_use_compression(rs)) {
2246 return false;
2247 }
2248
2249 /*
2250 * When starting the process of a new block, the first page of
2251 * the block should be sent out before other pages in the same
2252 * block, and all the pages in last block should have been sent
2253 * out, keeping this order is important, because the 'cont' flag
2254 * is used to avoid resending the block name.
2255 *
2256 * We post the fist page as normal page as compression will take
2257 * much CPU resource.
2258 */
2259 if (block != rs->last_sent_block) {
2260 flush_compressed_data(rs);
2261 return false;
2262 }
2263
2264 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2265 return true;
2266 }
2267
2268 compression_counters.busy++;
2269 return false;
2270 }
2271
2272 /**
2273 * ram_save_target_page: save one target page
2274 *
2275 * Returns the number of pages written
2276 *
2277 * @rs: current RAM state
2278 * @pss: data about the page we want to send
2279 */
2280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2281 {
2282 RAMBlock *block = pss->block;
2283 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2284 int res;
2285
2286 if (control_save_page(rs, block, offset, &res)) {
2287 return res;
2288 }
2289
2290 if (save_compress_page(rs, block, offset)) {
2291 return 1;
2292 }
2293
2294 res = save_zero_page(rs, block, offset);
2295 if (res > 0) {
2296 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2297 * page would be stale
2298 */
2299 if (!save_page_use_compression(rs)) {
2300 XBZRLE_cache_lock();
2301 xbzrle_cache_zero_page(rs, block->offset + offset);
2302 XBZRLE_cache_unlock();
2303 }
2304 return res;
2305 }
2306
2307 /*
2308 * Do not use multifd in postcopy as one whole host page should be
2309 * placed. Meanwhile postcopy requires atomic update of pages, so even
2310 * if host page size == guest page size the dest guest during run may
2311 * still see partially copied pages which is data corruption.
2312 */
2313 if (migrate_use_multifd() && !migration_in_postcopy()) {
2314 return ram_save_multifd_page(rs, block, offset);
2315 }
2316
2317 return ram_save_page(rs, pss);
2318 }
2319
2320 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2321 {
2322 MigrationState *ms = migrate_get_current();
2323
2324 /* Not enabled eager preempt? Then never do that. */
2325 if (!migrate_postcopy_preempt()) {
2326 return false;
2327 }
2328
2329 /* If the user explicitly disabled breaking of huge page, skip */
2330 if (!ms->postcopy_preempt_break_huge) {
2331 return false;
2332 }
2333
2334 /* If the ramblock we're sending is a small page? Never bother. */
2335 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2336 return false;
2337 }
2338
2339 /* Not in postcopy at all? */
2340 if (!migration_in_postcopy()) {
2341 return false;
2342 }
2343
2344 /*
2345 * If we're already handling a postcopy request, don't preempt as this page
2346 * has got the same high priority.
2347 */
2348 if (pss->postcopy_requested) {
2349 return false;
2350 }
2351
2352 /* If there's postcopy requests, then check it up! */
2353 return postcopy_has_request(rs);
2354 }
2355
2356 /* Returns true if we preempted precopy, false otherwise */
2357 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2358 {
2359 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2360
2361 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2362
2363 /*
2364 * Time to preempt precopy. Cache current PSS into preempt state, so that
2365 * after handling the postcopy pages we can recover to it. We need to do
2366 * so because the dest VM will have partial of the precopy huge page kept
2367 * over in its tmp huge page caches; better move on with it when we can.
2368 */
2369 p_state->ram_block = pss->block;
2370 p_state->ram_page = pss->page;
2371 p_state->preempted = true;
2372 }
2373
2374 /* Whether we're preempted by a postcopy request during sending a huge page */
2375 static bool postcopy_preempt_triggered(RAMState *rs)
2376 {
2377 return rs->postcopy_preempt_state.preempted;
2378 }
2379
2380 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2381 bool postcopy_requested)
2382 {
2383 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2384
2385 assert(state->preempted);
2386
2387 pss->block = state->ram_block;
2388 pss->page = state->ram_page;
2389
2390 /* Whether this is a postcopy request? */
2391 pss->postcopy_requested = postcopy_requested;
2392 /*
2393 * When restoring a preempted page, the old data resides in PRECOPY
2394 * slow channel, even if postcopy_requested is set. So always use
2395 * PRECOPY channel here.
2396 */
2397 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2398
2399 trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2400
2401 /* Reset preempt state, most importantly, set preempted==false */
2402 postcopy_preempt_reset(rs);
2403 }
2404
2405 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2406 {
2407 MigrationState *s = migrate_get_current();
2408 unsigned int channel = pss->postcopy_target_channel;
2409 QEMUFile *next;
2410
2411 if (channel != rs->postcopy_channel) {
2412 if (channel == RAM_CHANNEL_PRECOPY) {
2413 next = s->to_dst_file;
2414 } else {
2415 next = s->postcopy_qemufile_src;
2416 }
2417 /* Update and cache the current channel */
2418 rs->f = next;
2419 rs->postcopy_channel = channel;
2420
2421 /*
2422 * If channel switched, reset last_sent_block since the old sent block
2423 * may not be on the same channel.
2424 */
2425 rs->last_sent_block = NULL;
2426
2427 trace_postcopy_preempt_switch_channel(channel);
2428 }
2429
2430 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2431 }
2432
2433 /* We need to make sure rs->f always points to the default channel elsewhere */
2434 static void postcopy_preempt_reset_channel(RAMState *rs)
2435 {
2436 if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2437 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2438 rs->f = migrate_get_current()->to_dst_file;
2439 trace_postcopy_preempt_reset_channel();
2440 }
2441 }
2442
2443 /**
2444 * ram_save_host_page: save a whole host page
2445 *
2446 * Starting at *offset send pages up to the end of the current host
2447 * page. It's valid for the initial offset to point into the middle of
2448 * a host page in which case the remainder of the hostpage is sent.
2449 * Only dirty target pages are sent. Note that the host page size may
2450 * be a huge page for this block.
2451 * The saving stops at the boundary of the used_length of the block
2452 * if the RAMBlock isn't a multiple of the host page size.
2453 *
2454 * Returns the number of pages written or negative on error
2455 *
2456 * @rs: current RAM state
2457 * @pss: data about the page we want to send
2458 */
2459 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2460 {
2461 int tmppages, pages = 0;
2462 size_t pagesize_bits =
2463 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2464 unsigned long hostpage_boundary =
2465 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2466 unsigned long start_page = pss->page;
2467 int res;
2468
2469 if (ramblock_is_ignored(pss->block)) {
2470 error_report("block %s should not be migrated !", pss->block->idstr);
2471 return 0;
2472 }
2473
2474 if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2475 postcopy_preempt_choose_channel(rs, pss);
2476 }
2477
2478 do {
2479 if (postcopy_needs_preempt(rs, pss)) {
2480 postcopy_do_preempt(rs, pss);
2481 break;
2482 }
2483
2484 /* Check the pages is dirty and if it is send it */
2485 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2486 tmppages = ram_save_target_page(rs, pss);
2487 if (tmppages < 0) {
2488 return tmppages;
2489 }
2490
2491 pages += tmppages;
2492 /*
2493 * Allow rate limiting to happen in the middle of huge pages if
2494 * something is sent in the current iteration.
2495 */
2496 if (pagesize_bits > 1 && tmppages > 0) {
2497 migration_rate_limit();
2498 }
2499 }
2500 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2501 } while ((pss->page < hostpage_boundary) &&
2502 offset_in_ramblock(pss->block,
2503 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2504 /* The offset we leave with is the min boundary of host page and block */
2505 pss->page = MIN(pss->page, hostpage_boundary);
2506
2507 /*
2508 * When with postcopy preempt mode, flush the data as soon as possible for
2509 * postcopy requests, because we've already sent a whole huge page, so the
2510 * dst node should already have enough resource to atomically filling in
2511 * the current missing page.
2512 *
2513 * More importantly, when using separate postcopy channel, we must do
2514 * explicit flush or it won't flush until the buffer is full.
2515 */
2516 if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2517 qemu_fflush(rs->f);
2518 }
2519
2520 res = ram_save_release_protection(rs, pss, start_page);
2521 return (res < 0 ? res : pages);
2522 }
2523
2524 /**
2525 * ram_find_and_save_block: finds a dirty page and sends it to f
2526 *
2527 * Called within an RCU critical section.
2528 *
2529 * Returns the number of pages written where zero means no dirty pages,
2530 * or negative on error
2531 *
2532 * @rs: current RAM state
2533 *
2534 * On systems where host-page-size > target-page-size it will send all the
2535 * pages in a host page that are dirty.
2536 */
2537 static int ram_find_and_save_block(RAMState *rs)
2538 {
2539 PageSearchStatus pss;
2540 int pages = 0;
2541 bool again, found;
2542
2543 /* No dirty page as there is zero RAM */
2544 if (!ram_bytes_total()) {
2545 return pages;
2546 }
2547
2548 /*
2549 * Always keep last_seen_block/last_page valid during this procedure,
2550 * because find_dirty_block() relies on these values (e.g., we compare
2551 * last_seen_block with pss.block to see whether we searched all the
2552 * ramblocks) to detect the completion of migration. Having NULL value
2553 * of last_seen_block can conditionally cause below loop to run forever.
2554 */
2555 if (!rs->last_seen_block) {
2556 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2557 rs->last_page = 0;
2558 }
2559
2560 pss.block = rs->last_seen_block;
2561 pss.page = rs->last_page;
2562 pss.complete_round = false;
2563
2564 do {
2565 again = true;
2566 found = get_queued_page(rs, &pss);
2567
2568 if (!found) {
2569 /*
2570 * Recover previous precopy ramblock/offset if postcopy has
2571 * preempted precopy. Otherwise find the next dirty bit.
2572 */
2573 if (postcopy_preempt_triggered(rs)) {
2574 postcopy_preempt_restore(rs, &pss, false);
2575 found = true;
2576 } else {
2577 /* priority queue empty, so just search for something dirty */
2578 found = find_dirty_block(rs, &pss, &again);
2579 }
2580 }
2581
2582 if (found) {
2583 pages = ram_save_host_page(rs, &pss);
2584 }
2585 } while (!pages && again);
2586
2587 rs->last_seen_block = pss.block;
2588 rs->last_page = pss.page;
2589
2590 return pages;
2591 }
2592
2593 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2594 {
2595 uint64_t pages = size / TARGET_PAGE_SIZE;
2596
2597 if (zero) {
2598 ram_counters.duplicate += pages;
2599 } else {
2600 ram_counters.normal += pages;
2601 ram_transferred_add(size);
2602 qemu_file_credit_transfer(f, size);
2603 }
2604 }
2605
2606 static uint64_t ram_bytes_total_common(bool count_ignored)
2607 {
2608 RAMBlock *block;
2609 uint64_t total = 0;
2610
2611 RCU_READ_LOCK_GUARD();
2612
2613 if (count_ignored) {
2614 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2615 total += block->used_length;
2616 }
2617 } else {
2618 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2619 total += block->used_length;
2620 }
2621 }
2622 return total;
2623 }
2624
/* Total used_length, in bytes, of the RAM blocks that are not ignored */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2629
2630 static void xbzrle_load_setup(void)
2631 {
2632 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2633 }
2634
2635 static void xbzrle_load_cleanup(void)
2636 {
2637 g_free(XBZRLE.decoded_buf);
2638 XBZRLE.decoded_buf = NULL;
2639 }
2640
2641 static void ram_state_cleanup(RAMState **rsp)
2642 {
2643 if (*rsp) {
2644 migration_page_queue_free(*rsp);
2645 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2646 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2647 g_free(*rsp);
2648 *rsp = NULL;
2649 }
2650 }
2651
2652 static void xbzrle_cleanup(void)
2653 {
2654 XBZRLE_cache_lock();
2655 if (XBZRLE.cache) {
2656 cache_fini(XBZRLE.cache);
2657 g_free(XBZRLE.encoded_buf);
2658 g_free(XBZRLE.current_buf);
2659 g_free(XBZRLE.zero_target_page);
2660 XBZRLE.cache = NULL;
2661 XBZRLE.encoded_buf = NULL;
2662 XBZRLE.current_buf = NULL;
2663 XBZRLE.zero_target_page = NULL;
2664 }
2665 XBZRLE_cache_unlock();
2666 }
2667
2668 static void ram_save_cleanup(void *opaque)
2669 {
2670 RAMState **rsp = opaque;
2671 RAMBlock *block;
2672
2673 /* We don't use dirty log with background snapshots */
2674 if (!migrate_background_snapshot()) {
2675 /* caller have hold iothread lock or is in a bh, so there is
2676 * no writing race against the migration bitmap
2677 */
2678 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2679 /*
2680 * do not stop dirty log without starting it, since
2681 * memory_global_dirty_log_stop will assert that
2682 * memory_global_dirty_log_start/stop used in pairs
2683 */
2684 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2685 }
2686 }
2687
2688 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2689 g_free(block->clear_bmap);
2690 block->clear_bmap = NULL;
2691 g_free(block->bmap);
2692 block->bmap = NULL;
2693 }
2694
2695 xbzrle_cleanup();
2696 compress_threads_save_cleanup();
2697 ram_state_cleanup(rsp);
2698 }
2699
2700 static void ram_state_reset(RAMState *rs)
2701 {
2702 rs->last_seen_block = NULL;
2703 rs->last_sent_block = NULL;
2704 rs->last_page = 0;
2705 rs->last_version = ram_list.version;
2706 rs->xbzrle_enabled = false;
2707 postcopy_preempt_reset(rs);
2708 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2709 }
2710
2711 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2712
2713 /* **** functions for postcopy ***** */
2714
2715 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2716 {
2717 struct RAMBlock *block;
2718
2719 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2720 unsigned long *bitmap = block->bmap;
2721 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2722 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2723
2724 while (run_start < range) {
2725 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2726 ram_discard_range(block->idstr,
2727 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2728 ((ram_addr_t)(run_end - run_start))
2729 << TARGET_PAGE_BITS);
2730 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2731 }
2732 }
2733 }
2734
2735 /**
2736 * postcopy_send_discard_bm_ram: discard a RAMBlock
2737 *
2738 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2739 *
2740 * @ms: current migration state
2741 * @block: RAMBlock to discard
2742 */
2743 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2744 {
2745 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2746 unsigned long current;
2747 unsigned long *bitmap = block->bmap;
2748
2749 for (current = 0; current < end; ) {
2750 unsigned long one = find_next_bit(bitmap, end, current);
2751 unsigned long zero, discard_length;
2752
2753 if (one >= end) {
2754 break;
2755 }
2756
2757 zero = find_next_zero_bit(bitmap, end, one + 1);
2758
2759 if (zero >= end) {
2760 discard_length = end - one;
2761 } else {
2762 discard_length = zero - one;
2763 }
2764 postcopy_discard_send_range(ms, one, discard_length);
2765 current = one + discard_length;
2766 }
2767 }
2768
2769 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2770
2771 /**
2772 * postcopy_each_ram_send_discard: discard all RAMBlocks
2773 *
2774 * Utility for the outgoing postcopy code.
2775 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2776 * passing it bitmap indexes and name.
2777 * (qemu_ram_foreach_block ends up passing unscaled lengths
2778 * which would mean postcopy code would have to deal with target page)
2779 *
2780 * @ms: current migration state
2781 */
2782 static void postcopy_each_ram_send_discard(MigrationState *ms)
2783 {
2784 struct RAMBlock *block;
2785
2786 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2787 postcopy_discard_send_init(ms, block->idstr);
2788
2789 /*
2790 * Deal with TPS != HPS and huge pages. It discard any partially sent
2791 * host-page size chunks, mark any partially dirty host-page size
2792 * chunks as all dirty. In this case the host-page is the host-page
2793 * for the particular RAMBlock, i.e. it might be a huge page.
2794 */
2795 postcopy_chunk_hostpages_pass(ms, block);
2796
2797 /*
2798 * Postcopy sends chunks of bitmap over the wire, but it
2799 * just needs indexes at this point, avoids it having
2800 * target page specific code.
2801 */
2802 postcopy_send_discard_bm_ram(ms, block);
2803 postcopy_discard_send_finish(ms);
2804 }
2805 }
2806
2807 /**
2808 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2809 *
2810 * Helper for postcopy_chunk_hostpages; it's called twice to
2811 * canonicalize the two bitmaps, that are similar, but one is
2812 * inverted.
2813 *
2814 * Postcopy requires that all target pages in a hostpage are dirty or
2815 * clean, not a mix. This function canonicalizes the bitmaps.
2816 *
2817 * @ms: current migration state
2818 * @block: block that contains the page we want to canonicalize
2819 */
2820 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2821 {
2822 RAMState *rs = ram_state;
2823 unsigned long *bitmap = block->bmap;
2824 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2825 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2826 unsigned long run_start;
2827
2828 if (block->page_size == TARGET_PAGE_SIZE) {
2829 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2830 return;
2831 }
2832
2833 /* Find a dirty page */
2834 run_start = find_next_bit(bitmap, pages, 0);
2835
2836 while (run_start < pages) {
2837
2838 /*
2839 * If the start of this run of pages is in the middle of a host
2840 * page, then we need to fixup this host page.
2841 */
2842 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2843 /* Find the end of this run */
2844 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2845 /*
2846 * If the end isn't at the start of a host page, then the
2847 * run doesn't finish at the end of a host page
2848 * and we need to discard.
2849 */
2850 }
2851
2852 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2853 unsigned long page;
2854 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2855 host_ratio);
2856 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2857
2858 /* Clean up the bitmap */
2859 for (page = fixup_start_addr;
2860 page < fixup_start_addr + host_ratio; page++) {
2861 /*
2862 * Remark them as dirty, updating the count for any pages
2863 * that weren't previously dirty.
2864 */
2865 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2866 }
2867 }
2868
2869 /* Find the next dirty page for the next iteration */
2870 run_start = find_next_bit(bitmap, pages, run_start);
2871 }
2872 }
2873
2874 /**
2875 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2876 *
2877 * Transmit the set of pages to be discarded after precopy to the target
2878 * these are pages that:
2879 * a) Have been previously transmitted but are now dirty again
2880 * b) Pages that have never been transmitted, this ensures that
2881 * any pages on the destination that have been mapped by background
2882 * tasks get discarded (transparent huge pages is the specific concern)
2883 * Hopefully this is pretty sparse
2884 *
2885 * @ms: current migration state
2886 */
2887 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2888 {
2889 RAMState *rs = ram_state;
2890
2891 RCU_READ_LOCK_GUARD();
2892
2893 /* This should be our last sync, the src is now paused */
2894 migration_bitmap_sync(rs);
2895
2896 /* Easiest way to make sure we don't resume in the middle of a host-page */
2897 rs->last_seen_block = NULL;
2898 rs->last_sent_block = NULL;
2899 rs->last_page = 0;
2900
2901 postcopy_each_ram_send_discard(ms);
2902
2903 trace_ram_postcopy_send_discard_bitmap();
2904 }
2905
2906 /**
2907 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2908 *
2909 * Returns zero on success
2910 *
2911 * @rbname: name of the RAMBlock of the request. NULL means the
2912 * same that last one.
2913 * @start: RAMBlock starting page
2914 * @length: RAMBlock size
2915 */
2916 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2917 {
2918 trace_ram_discard_range(rbname, start, length);
2919
2920 RCU_READ_LOCK_GUARD();
2921 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2922
2923 if (!rb) {
2924 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2925 return -1;
2926 }
2927
2928 /*
2929 * On source VM, we don't need to update the received bitmap since
2930 * we don't even have one.
2931 */
2932 if (rb->receivedmap) {
2933 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2934 length >> qemu_target_page_bits());
2935 }
2936
2937 return ram_block_discard_range(rb, start, length);
2938 }
2939
2940 /*
2941 * For every allocation, we will try not to crash the VM if the
2942 * allocation failed.
2943 */
2944 static int xbzrle_init(void)
2945 {
2946 Error *local_err = NULL;
2947
2948 if (!migrate_use_xbzrle()) {
2949 return 0;
2950 }
2951
2952 XBZRLE_cache_lock();
2953
2954 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2955 if (!XBZRLE.zero_target_page) {
2956 error_report("%s: Error allocating zero page", __func__);
2957 goto err_out;
2958 }
2959
2960 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2961 TARGET_PAGE_SIZE, &local_err);
2962 if (!XBZRLE.cache) {
2963 error_report_err(local_err);
2964 goto free_zero_page;
2965 }
2966
2967 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2968 if (!XBZRLE.encoded_buf) {
2969 error_report("%s: Error allocating encoded_buf", __func__);
2970 goto free_cache;
2971 }
2972
2973 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2974 if (!XBZRLE.current_buf) {
2975 error_report("%s: Error allocating current_buf", __func__);
2976 goto free_encoded_buf;
2977 }
2978
2979 /* We are all good */
2980 XBZRLE_cache_unlock();
2981 return 0;
2982
2983 free_encoded_buf:
2984 g_free(XBZRLE.encoded_buf);
2985 XBZRLE.encoded_buf = NULL;
2986 free_cache:
2987 cache_fini(XBZRLE.cache);
2988 XBZRLE.cache = NULL;
2989 free_zero_page:
2990 g_free(XBZRLE.zero_target_page);
2991 XBZRLE.zero_target_page = NULL;
2992 err_out:
2993 XBZRLE_cache_unlock();
2994 return -ENOMEM;
2995 }
2996
2997 static int ram_state_init(RAMState **rsp)
2998 {
2999 *rsp = g_try_new0(RAMState, 1);
3000
3001 if (!*rsp) {
3002 error_report("%s: Init ramstate fail", __func__);
3003 return -1;
3004 }
3005
3006 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3007 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3008 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3009
3010 /*
3011 * Count the total number of pages used by ram blocks not including any
3012 * gaps due to alignment or unplugs.
3013 * This must match with the initial values of dirty bitmap.
3014 */
3015 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3016 ram_state_reset(*rsp);
3017
3018 return 0;
3019 }
3020
3021 static void ram_list_init_bitmaps(void)
3022 {
3023 MigrationState *ms = migrate_get_current();
3024 RAMBlock *block;
3025 unsigned long pages;
3026 uint8_t shift;
3027
3028 /* Skip setting bitmap if there is no RAM */
3029 if (ram_bytes_total()) {
3030 shift = ms->clear_bitmap_shift;
3031 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3032 error_report("clear_bitmap_shift (%u) too big, using "
3033 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3034 shift = CLEAR_BITMAP_SHIFT_MAX;
3035 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3036 error_report("clear_bitmap_shift (%u) too small, using "
3037 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3038 shift = CLEAR_BITMAP_SHIFT_MIN;
3039 }
3040
3041 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3042 pages = block->max_length >> TARGET_PAGE_BITS;
3043 /*
3044 * The initial dirty bitmap for migration must be set with all
3045 * ones to make sure we'll migrate every guest RAM page to
3046 * destination.
3047 * Here we set RAMBlock.bmap all to 1 because when rebegin a
3048 * new migration after a failed migration, ram_list.
3049 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3050 * guest memory.
3051 */
3052 block->bmap = bitmap_new(pages);
3053 bitmap_set(block->bmap, 0, pages);
3054 block->clear_bmap_shift = shift;
3055 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3056 }
3057 }
3058 }
3059
3060 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3061 {
3062 unsigned long pages;
3063 RAMBlock *rb;
3064
3065 RCU_READ_LOCK_GUARD();
3066
3067 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3068 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3069 rs->migration_dirty_pages -= pages;
3070 }
3071 }
3072
3073 static void ram_init_bitmaps(RAMState *rs)
3074 {
3075 /* For memory_global_dirty_log_start below. */
3076 qemu_mutex_lock_iothread();
3077 qemu_mutex_lock_ramlist();
3078
3079 WITH_RCU_READ_LOCK_GUARD() {
3080 ram_list_init_bitmaps();
3081 /* We don't use dirty log with background snapshots */
3082 if (!migrate_background_snapshot()) {
3083 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3084 migration_bitmap_sync_precopy(rs);
3085 }
3086 }
3087 qemu_mutex_unlock_ramlist();
3088 qemu_mutex_unlock_iothread();
3089
3090 /*
3091 * After an eventual first bitmap sync, fixup the initial bitmap
3092 * containing all 1s to exclude any discarded pages from migration.
3093 */
3094 migration_bitmap_clear_discarded_pages(rs);
3095 }
3096
3097 static int ram_init_all(RAMState **rsp)
3098 {
3099 if (ram_state_init(rsp)) {
3100 return -1;
3101 }
3102
3103 if (xbzrle_init()) {
3104 ram_state_cleanup(rsp);
3105 return -1;
3106 }
3107
3108 ram_init_bitmaps(*rsp);
3109
3110 return 0;
3111 }
3112
3113 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3114 {
3115 RAMBlock *block;
3116 uint64_t pages = 0;
3117
3118 /*
3119 * Postcopy is not using xbzrle/compression, so no need for that.
3120 * Also, since source are already halted, we don't need to care
3121 * about dirty page logging as well.
3122 */
3123
3124 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3125 pages += bitmap_count_one(block->bmap,
3126 block->used_length >> TARGET_PAGE_BITS);
3127 }
3128
3129 /* This may not be aligned with current bitmaps. Recalculate. */
3130 rs->migration_dirty_pages = pages;
3131
3132 ram_state_reset(rs);
3133
3134 /* Update RAMState cache of output QEMUFile */
3135 rs->f = out;
3136
3137 trace_ram_state_resume_prepare(pages);
3138 }
3139
3140 /*
3141 * This function clears bits of the free pages reported by the caller from the
3142 * migration dirty bitmap. @addr is the host address corresponding to the
3143 * start of the continuous guest free pages, and @len is the total bytes of
3144 * those pages.
3145 */
3146 void qemu_guest_free_page_hint(void *addr, size_t len)
3147 {
3148 RAMBlock *block;
3149 ram_addr_t offset;
3150 size_t used_len, start, npages;
3151 MigrationState *s = migrate_get_current();
3152
3153 /* This function is currently expected to be used during live migration */
3154 if (!migration_is_setup_or_active(s->state)) {
3155 return;
3156 }
3157
3158 for (; len > 0; len -= used_len, addr += used_len) {
3159 block = qemu_ram_block_from_host(addr, false, &offset);
3160 if (unlikely(!block || offset >= block->used_length)) {
3161 /*
3162 * The implementation might not support RAMBlock resize during
3163 * live migration, but it could happen in theory with future
3164 * updates. So we add a check here to capture that case.
3165 */
3166 error_report_once("%s unexpected error", __func__);
3167 return;
3168 }
3169
3170 if (len <= block->used_length - offset) {
3171 used_len = len;
3172 } else {
3173 used_len = block->used_length - offset;
3174 }
3175
3176 start = offset >> TARGET_PAGE_BITS;
3177 npages = used_len >> TARGET_PAGE_BITS;
3178
3179 qemu_mutex_lock(&ram_state->bitmap_mutex);
3180 /*
3181 * The skipped free pages are equavalent to be sent from clear_bmap's
3182 * perspective, so clear the bits from the memory region bitmap which
3183 * are initially set. Otherwise those skipped pages will be sent in
3184 * the next round after syncing from the memory region bitmap.
3185 */
3186 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3187 ram_state->migration_dirty_pages -=
3188 bitmap_count_one_with_offset(block->bmap, start, npages);
3189 bitmap_clear(block->bmap, start, npages);
3190 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3191 }
3192 }
3193
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3200
3201 /**
3202 * ram_save_setup: Setup RAM for migration
3203 *
3204 * Returns zero to indicate success and negative for error
3205 *
3206 * @f: QEMUFile where to send the data
3207 * @opaque: RAMState pointer
3208 */
3209 static int ram_save_setup(QEMUFile *f, void *opaque)
3210 {
3211 RAMState **rsp = opaque;
3212 RAMBlock *block;
3213 int ret;
3214
3215 if (compress_threads_save_setup()) {
3216 return -1;
3217 }
3218
3219 /* migration has already setup the bitmap, reuse it. */
3220 if (!migration_in_colo_state()) {
3221 if (ram_init_all(rsp) != 0) {
3222 compress_threads_save_cleanup();
3223 return -1;
3224 }
3225 }
3226 (*rsp)->f = f;
3227
3228 WITH_RCU_READ_LOCK_GUARD() {
3229 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3230
3231 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3232 qemu_put_byte(f, strlen(block->idstr));
3233 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3234 qemu_put_be64(f, block->used_length);
3235 if (migrate_postcopy_ram() && block->page_size !=
3236 qemu_host_page_size) {
3237 qemu_put_be64(f, block->page_size);
3238 }
3239 if (migrate_ignore_shared()) {
3240 qemu_put_be64(f, block->mr->addr);
3241 }
3242 }
3243 }
3244
3245 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3246 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3247
3248 ret = multifd_send_sync_main(f);
3249 if (ret < 0) {
3250 return ret;
3251 }
3252
3253 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3254 qemu_fflush(f);
3255
3256 return 0;
3257 }
3258
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is hit (and no postcopy request
 * is pending), an error is seen on @f, no page is left, or the loop has
 * been running for more than MAX_WAIT milliseconds.
 *
 * Returns a negative value on error.  Otherwise returns 1 when
 * ram_find_and_save_block() reported that no page is left to send, and
 * 0 when the iteration stopped for any other reason.
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least released it in a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last pass: start over. */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * ret keeps the last qemu_file_rate_limit() result; a pending
         * postcopy request keeps the loop going even when rate limited.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* Record the failure on the file; it is read back below. */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check
             * every 64 iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, in milliseconds. */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    postcopy_preempt_reset_channel(rs);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        ret = multifd_send_sync_main(rs->f);
        if (ret < 0) {
            return ret;
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker written just above. */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3380
3381 /**
3382 * ram_save_complete: function called to send the remaining amount of ram
3383 *
3384 * Returns zero to indicate success or negative on error
3385 *
3386 * Called with iothread lock
3387 *
3388 * @f: QEMUFile where to send the data
3389 * @opaque: RAMState pointer
3390 */
3391 static int ram_save_complete(QEMUFile *f, void *opaque)
3392 {
3393 RAMState **temp = opaque;
3394 RAMState *rs = *temp;
3395 int ret = 0;
3396
3397 rs->last_stage = !migration_in_colo_state();
3398
3399 WITH_RCU_READ_LOCK_GUARD() {
3400 if (!migration_in_postcopy()) {
3401 migration_bitmap_sync_precopy(rs);
3402 }
3403
3404 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3405
3406 /* try transferring iterative blocks of memory */
3407
3408 /* flush all remaining blocks regardless of rate limiting */
3409 while (true) {
3410 int pages;
3411
3412 pages = ram_find_and_save_block(rs);
3413 /* no more blocks to sent */
3414 if (pages == 0) {
3415 break;
3416 }
3417 if (pages < 0) {
3418 ret = pages;
3419 break;
3420 }
3421 }
3422
3423 flush_compressed_data(rs);
3424 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3425 }
3426
3427 if (ret < 0) {
3428 return ret;
3429 }
3430
3431 postcopy_preempt_reset_channel(rs);
3432
3433 ret = multifd_send_sync_main(rs->f);
3434 if (ret < 0) {
3435 return ret;
3436 }
3437
3438 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3439 qemu_fflush(f);
3440
3441 return 0;
3442 }
3443
3444 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3445 uint64_t *res_precopy_only,
3446 uint64_t *res_compatible,
3447 uint64_t *res_postcopy_only)
3448 {
3449 RAMState **temp = opaque;
3450 RAMState *rs = *temp;
3451 uint64_t remaining_size;
3452
3453 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3454
3455 if (!migration_in_postcopy() &&
3456 remaining_size < max_size) {
3457 qemu_mutex_lock_iothread();
3458 WITH_RCU_READ_LOCK_GUARD() {
3459 migration_bitmap_sync_precopy(rs);
3460 }
3461 qemu_mutex_unlock_iothread();
3462 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3463 }
3464
3465 if (migrate_postcopy_ram()) {
3466 /* We can do postcopy, and all the data is postcopiable */
3467 *res_compatible += remaining_size;
3468 } else {
3469 *res_precopy_only += remaining_size;
3470 }
3471 }
3472
3473 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3474 {
3475 unsigned int xh_len;
3476 int xh_flags;
3477 uint8_t *loaded_data;
3478
3479 /* extract RLE header */
3480 xh_flags = qemu_get_byte(f);
3481 xh_len = qemu_get_be16(f);
3482
3483 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3484 error_report("Failed to load XBZRLE page - wrong compression!");
3485 return -1;
3486 }
3487
3488 if (xh_len > TARGET_PAGE_SIZE) {
3489 error_report("Failed to load XBZRLE page - len overflow!");
3490 return -1;
3491 }
3492 loaded_data = XBZRLE.decoded_buf;
3493 /* load data and decode */
3494 /* it can change loaded_data to point to an internal buffer */
3495 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3496
3497 /* decode RLE */
3498 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3499 TARGET_PAGE_SIZE) == -1) {
3500 error_report("Failed to load XBZRLE page - decode error!");
3501 return -1;
3502 }
3503
3504 return 0;
3505 }
3506
3507 /**
3508 * ram_block_from_stream: read a RAMBlock id from the migration stream
3509 *
3510 * Must be called from within a rcu critical section.
3511 *
3512 * Returns a pointer from within the RCU-protected ram_list.
3513 *
3514 * @mis: the migration incoming state pointer
3515 * @f: QEMUFile where to read the data from
3516 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3517 * @channel: the channel we're using
3518 */
3519 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3520 QEMUFile *f, int flags,
3521 int channel)
3522 {
3523 RAMBlock *block = mis->last_recv_block[channel];
3524 char id[256];
3525 uint8_t len;
3526
3527 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3528 if (!block) {
3529 error_report("Ack, bad migration stream!");
3530 return NULL;
3531 }
3532 return block;
3533 }
3534
3535 len = qemu_get_byte(f);
3536 qemu_get_buffer(f, (uint8_t *)id, len);
3537 id[len] = 0;
3538
3539 block = qemu_ram_block_by_name(id);
3540 if (!block) {
3541 error_report("Can't find block %s", id);
3542 return NULL;
3543 }
3544
3545 if (ramblock_is_ignored(block)) {
3546 error_report("block %s should not be migrated !", id);
3547 return NULL;
3548 }
3549
3550 mis->last_recv_block[channel] = block;
3551
3552 return block;
3553 }
3554
3555 static inline void *host_from_ram_block_offset(RAMBlock *block,
3556 ram_addr_t offset)
3557 {
3558 if (!offset_in_ramblock(block, offset)) {
3559 return NULL;
3560 }
3561
3562 return block->host + offset;
3563 }
3564
3565 static void *host_page_from_ram_block_offset(RAMBlock *block,
3566 ram_addr_t offset)
3567 {
3568 /* Note: Explicitly no check against offset_in_ramblock(). */
3569 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3570 block->page_size);
3571 }
3572
3573 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3574 ram_addr_t offset)
3575 {
3576 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3577 }
3578
3579 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3580 ram_addr_t offset, bool record_bitmap)
3581 {
3582 if (!offset_in_ramblock(block, offset)) {
3583 return NULL;
3584 }
3585 if (!block->colo_cache) {
3586 error_report("%s: colo_cache is NULL in block :%s",
3587 __func__, block->idstr);
3588 return NULL;
3589 }
3590
3591 /*
3592 * During colo checkpoint, we need bitmap of these migrated pages.
3593 * It help us to decide which pages in ram cache should be flushed
3594 * into VM's RAM later.
3595 */
3596 if (record_bitmap &&
3597 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3598 ram_state->migration_dirty_pages++;
3599 }
3600 return block->colo_cache + offset;
3601 }
3602
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with the byte @ch.  When @ch is
 * zero and the destination already reads as zero, it is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already all zero and asked for zero: skip the write. */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3619
3620 /* return the size after decompression, or negative value on error */
3621 static int
3622 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3623 const uint8_t *source, size_t source_len)
3624 {
3625 int err;
3626
3627 err = inflateReset(stream);
3628 if (err != Z_OK) {
3629 return -1;
3630 }
3631
3632 stream->avail_in = source_len;
3633 stream->next_in = (uint8_t *)source;
3634 stream->avail_out = dest_len;
3635 stream->next_out = dest;
3636
3637 err = inflate(stream, Z_NO_FLUSH);
3638 if (err != Z_STREAM_END) {
3639 return -1;
3640 }
3641
3642 return stream->total_out;
3643 }
3644
3645 static void *do_data_decompress(void *opaque)
3646 {
3647 DecompressParam *param = opaque;
3648 unsigned long pagesize;
3649 uint8_t *des;
3650 int len, ret;
3651
3652 qemu_mutex_lock(&param->mutex);
3653 while (!param->quit) {
3654 if (param->des) {
3655 des = param->des;
3656 len = param->len;
3657 param->des = 0;
3658 qemu_mutex_unlock(&param->mutex);
3659
3660 pagesize = TARGET_PAGE_SIZE;
3661
3662 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3663 param->compbuf, len);
3664 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3665 error_report("decompress data failed");
3666 qemu_file_set_error(decomp_file, ret);
3667 }
3668
3669 qemu_mutex_lock(&decomp_done_lock);
3670 param->done = true;
3671 qemu_cond_signal(&decomp_done_cond);
3672 qemu_mutex_unlock(&decomp_done_lock);
3673
3674 qemu_mutex_lock(&param->mutex);
3675 } else {
3676 qemu_cond_wait(&param->cond, &param->mutex);
3677 }
3678 }
3679 qemu_mutex_unlock(&param->mutex);
3680
3681 return NULL;
3682 }
3683
3684 static int wait_for_decompress_done(void)
3685 {
3686 int idx, thread_count;
3687
3688 if (!migrate_use_compression()) {
3689 return 0;
3690 }
3691
3692 thread_count = migrate_decompress_threads();
3693 qemu_mutex_lock(&decomp_done_lock);
3694 for (idx = 0; idx < thread_count; idx++) {
3695 while (!decomp_param[idx].done) {
3696 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3697 }
3698 }
3699 qemu_mutex_unlock(&decomp_done_lock);
3700 return qemu_file_get_error(decomp_file);
3701 }
3702
3703 static void compress_threads_load_cleanup(void)
3704 {
3705 int i, thread_count;
3706
3707 if (!migrate_use_compression()) {
3708 return;
3709 }
3710 thread_count = migrate_decompress_threads();
3711 for (i = 0; i < thread_count; i++) {
3712 /*
3713 * we use it as a indicator which shows if the thread is
3714 * properly init'd or not
3715 */
3716 if (!decomp_param[i].compbuf) {
3717 break;
3718 }
3719
3720 qemu_mutex_lock(&decomp_param[i].mutex);
3721 decomp_param[i].quit = true;
3722 qemu_cond_signal(&decomp_param[i].cond);
3723 qemu_mutex_unlock(&decomp_param[i].mutex);
3724 }
3725 for (i = 0; i < thread_count; i++) {
3726 if (!decomp_param[i].compbuf) {
3727 break;
3728 }
3729
3730 qemu_thread_join(decompress_threads + i);
3731 qemu_mutex_destroy(&decomp_param[i].mutex);
3732 qemu_cond_destroy(&decomp_param[i].cond);
3733 inflateEnd(&decomp_param[i].stream);
3734 g_free(decomp_param[i].compbuf);
3735 decomp_param[i].compbuf = NULL;
3736 }
3737 g_free(decompress_threads);
3738 g_free(decomp_param);
3739 decompress_threads = NULL;
3740 decomp_param = NULL;
3741 decomp_file = NULL;
3742 }
3743
3744 static int compress_threads_load_setup(QEMUFile *f)
3745 {
3746 int i, thread_count;
3747
3748 if (!migrate_use_compression()) {
3749 return 0;
3750 }
3751
3752 thread_count = migrate_decompress_threads();
3753 decompress_threads = g_new0(QemuThread, thread_count);
3754 decomp_param = g_new0(DecompressParam, thread_count);
3755 qemu_mutex_init(&decomp_done_lock);
3756 qemu_cond_init(&decomp_done_cond);
3757 decomp_file = f;
3758 for (i = 0; i < thread_count; i++) {
3759 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3760 goto exit;
3761 }
3762
3763 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3764 qemu_mutex_init(&decomp_param[i].mutex);
3765 qemu_cond_init(&decomp_param[i].cond);
3766 decomp_param[i].done = true;
3767 decomp_param[i].quit = false;
3768 qemu_thread_create(decompress_threads + i, "decompress",
3769 do_data_decompress, decomp_param + i,
3770 QEMU_THREAD_JOINABLE);
3771 }
3772 return 0;
3773 exit:
3774 compress_threads_load_cleanup();
3775 return -1;
3776 }
3777
3778 static void decompress_data_with_multi_threads(QEMUFile *f,
3779 void *host, int len)
3780 {
3781 int idx, thread_count;
3782
3783 thread_count = migrate_decompress_threads();
3784 QEMU_LOCK_GUARD(&decomp_done_lock);
3785 while (true) {
3786 for (idx = 0; idx < thread_count; idx++) {
3787 if (decomp_param[idx].done) {
3788 decomp_param[idx].done = false;
3789 qemu_mutex_lock(&decomp_param[idx].mutex);
3790 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3791 decomp_param[idx].des = host;
3792 decomp_param[idx].len = len;
3793 qemu_cond_signal(&decomp_param[idx].cond);
3794 qemu_mutex_unlock(&decomp_param[idx].mutex);
3795 break;
3796 }
3797 }
3798 if (idx < thread_count) {
3799 break;
3800 } else {
3801 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3802 }
3803 }
3804 }
3805
/*
 * Initialise the file-level ram_state through ram_state_init().
 *
 * NOTE(review): the result of ram_state_init() is discarded here, while
 * ram_init_all() treats a non-zero result as a failure - confirm that
 * ignoring it is intended.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3810
3811 /*
3812 * colo cache: this is for secondary VM, we cache the whole
3813 * memory of the secondary VM, it is need to hold the global lock
3814 * to call this helper.
3815 */
3816 int colo_init_ram_cache(void)
3817 {
3818 RAMBlock *block;
3819
3820 WITH_RCU_READ_LOCK_GUARD() {
3821 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3822 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3823 NULL, false, false);
3824 if (!block->colo_cache) {
3825 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3826 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3827 block->used_length);
3828 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3829 if (block->colo_cache) {
3830 qemu_anon_ram_free(block->colo_cache, block->used_length);
3831 block->colo_cache = NULL;
3832 }
3833 }
3834 return -errno;
3835 }
3836 if (!machine_dump_guest_core(current_machine)) {
3837 qemu_madvise(block->colo_cache, block->used_length,
3838 QEMU_MADV_DONTDUMP);
3839 }
3840 }
3841 }
3842
3843 /*
3844 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3845 * with to decide which page in cache should be flushed into SVM's RAM. Here
3846 * we use the same name 'ram_bitmap' as for migration.
3847 */
3848 if (ram_bytes_total()) {
3849 RAMBlock *block;
3850
3851 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3852 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3853 block->bmap = bitmap_new(pages);
3854 }
3855 }
3856
3857 colo_init_ram_state();
3858 return 0;
3859 }
3860
3861 /* TODO: duplicated with ram_init_bitmaps */
3862 void colo_incoming_start_dirty_log(void)
3863 {
3864 RAMBlock *block = NULL;
3865 /* For memory_global_dirty_log_start below. */
3866 qemu_mutex_lock_iothread();
3867 qemu_mutex_lock_ramlist();
3868
3869 memory_global_dirty_log_sync();
3870 WITH_RCU_READ_LOCK_GUARD() {
3871 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3872 ramblock_sync_dirty_bitmap(ram_state, block);
3873 /* Discard this dirty bitmap record */
3874 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3875 }
3876 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3877 }
3878 ram_state->migration_dirty_pages = 0;
3879 qemu_mutex_unlock_ramlist();
3880 qemu_mutex_unlock_iothread();
3881 }
3882
3883 /* It is need to hold the global lock to call this helper */
3884 void colo_release_ram_cache(void)
3885 {
3886 RAMBlock *block;
3887
3888 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3889 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3890 g_free(block->bmap);
3891 block->bmap = NULL;
3892 }
3893
3894 WITH_RCU_READ_LOCK_GUARD() {
3895 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3896 if (block->colo_cache) {
3897 qemu_anon_ram_free(block->colo_cache, block->used_length);
3898 block->colo_cache = NULL;
3899 }
3900 }
3901 }
3902 ram_state_cleanup(&ram_state);
3903 }
3904
3905 /**
3906 * ram_load_setup: Setup RAM for migration incoming side
3907 *
3908 * Returns zero to indicate success and negative for error
3909 *
3910 * @f: QEMUFile where to receive the data
3911 * @opaque: RAMState pointer
3912 */
3913 static int ram_load_setup(QEMUFile *f, void *opaque)
3914 {
3915 if (compress_threads_load_setup(f)) {
3916 return -1;
3917 }
3918
3919 xbzrle_load_setup();
3920 ramblock_recv_map_init();
3921
3922 return 0;
3923 }
3924
3925 static int ram_load_cleanup(void *opaque)
3926 {
3927 RAMBlock *rb;
3928
3929 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3930 qemu_ram_block_writeback(rb);
3931 }
3932
3933 xbzrle_load_cleanup();
3934 compress_threads_load_cleanup();
3935
3936 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3937 g_free(rb->receivedmap);
3938 rb->receivedmap = NULL;
3939 }
3940
3941 return 0;
3942 }
3943
3944 /**
3945 * ram_postcopy_incoming_init: allocate postcopy data structures
3946 *
3947 * Returns 0 for success and negative if there was one error
3948 *
3949 * @mis: current migration incoming state
3950 *
3951 * Allocate data structures etc needed by incoming migration with
3952 * postcopy-ram. postcopy-ram's similarly names
3953 * postcopy_ram_incoming_init does the work.
3954 */
3955 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3956 {
3957 return postcopy_ram_incoming_init(mis);
3958 }
3959
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Reads page records from @f until an EOS record or an error.  Target
 * pages are collected in the channel's temporary page and the host page
 * is placed once all of its target pages have arrived.
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the data from
 * @channel: the channel to use for loading
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        /* Record header: page address with the flags in the low bits. */
        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /*
             * The first target page of a host page fixes the host address
             * to place at; every later one must map to that same host page.
             */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        /*
         * Note: a 'break' inside this switch only leaves the switch; a
         * non-zero ret ends the loop at its next condition check.
         */
        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            /* If all TP are zero then we can optimise the place */
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            /*
             * NOTE(review): unlike ram_load_precopy(), no check that
             * compression is enabled is visible on this path, and
             * decomp_param is only allocated when it is - confirm that
             * this flag cannot arrive otherwise.
             */
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /*
         * Got the whole host page, wait for decompress before placing.
         * NOTE(review): '|=' ORs two error codes together when ret is
         * already non-zero; the result stays non-zero but may not be a
         * meaningful errno value.
         */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            /* Host page handled: start collecting the next one. */
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}
4142
4143 static bool postcopy_is_advised(void)
4144 {
4145 PostcopyState ps = postcopy_state_get();
4146 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4147 }
4148
4149 static bool postcopy_is_running(void)
4150 {
4151 PostcopyState ps = postcopy_state_get();
4152 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4153 }
4154
/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that have been dirtied by PVM or SVM or both.
 *
 * The dirty log is synced into every block's bitmap first; the blocks
 * are then walked and each run of dirty pages is copied from
 * block->colo_cache to block->host.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    /* Page index inside the current block at which the search resumes. */
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            /* Number of pages in the dirty run found at 'offset'. */
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                /* Result lies outside this block: go on to the next one. */
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                /* Clear the dirty state of each page in the run... */
                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                /* ...then copy the whole run from the cache in one go. */
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}
4203
4204 /**
4205 * ram_load_precopy: load pages in precopy case
4206 *
4207 * Returns 0 for success or -errno in case of error
4208 *
4209 * Called in precopy mode by ram_load().
4210 * rcu_read_lock is taken prior to this being called.
4211 *
4212 * @f: QEMUFile where to send the data
4213 */
4214 static int ram_load_precopy(QEMUFile *f)
4215 {
4216 MigrationIncomingState *mis = migration_incoming_get_current();
4217 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4218 /* ADVISE is earlier, it shows the source has the postcopy capability on */
4219 bool postcopy_advised = postcopy_is_advised();
4220 if (!migrate_use_compression()) {
4221 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4222 }
4223
4224 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4225 ram_addr_t addr, total_ram_bytes;
4226 void *host = NULL, *host_bak = NULL;
4227 uint8_t ch;
4228
4229 /*
4230 * Yield periodically to let main loop run, but an iteration of
4231 * the main loop is expensive, so do it each some iterations
4232 */
4233 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4234 aio_co_schedule(qemu_get_current_aio_context(),
4235 qemu_coroutine_self());
4236 qemu_coroutine_yield();
4237 }
4238 i++;
4239
4240 addr = qemu_get_be64(f);
4241 flags = addr & ~TARGET_PAGE_MASK;
4242 addr &= TARGET_PAGE_MASK;
4243
4244 if (flags & invalid_flags) {
4245 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4246 error_report("Received an unexpected compressed page");
4247 }
4248
4249 ret = -EINVAL;
4250 break;
4251 }
4252
4253 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4254 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4255 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4256 RAM_CHANNEL_PRECOPY);
4257
4258 host = host_from_ram_block_offset(block, addr);
4259 /*
4260 * After going into COLO stage, we should not load the page
4261 * into SVM's memory directly, we put them into colo_cache firstly.
4262 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4263 * Previously, we copied all these memory in preparing stage of COLO
4264 * while we need to stop VM, which is a time-consuming process.
4265 * Here we optimize it by a trick, back-up every page while in
4266 * migration process while COLO is enabled, though it affects the
4267 * speed of the migration, but it obviously reduce the downtime of
4268 * back-up all SVM'S memory in COLO preparing stage.
4269 */
4270 if (migration_incoming_colo_enabled()) {
4271 if (migration_incoming_in_colo_state()) {
4272 /* In COLO stage, put all pages into cache temporarily */
4273 host = colo_cache_from_block_offset(block, addr, true);
4274 } else {
4275 /*
4276 * In migration stage but before COLO stage,
4277 * Put all pages into both cache and SVM's memory.
4278 */
4279 host_bak = colo_cache_from_block_offset(block, addr, false);
4280 }
4281 }
4282 if (!host) {
4283 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4284 ret = -EINVAL;
4285 break;
4286 }
4287 if (!migration_incoming_in_colo_state()) {
4288 ramblock_recv_bitmap_set(block, host);
4289 }
4290
4291 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4292 }
4293
4294 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4295 case RAM_SAVE_FLAG_MEM_SIZE:
4296 /* Synchronize RAM block list */
4297 total_ram_bytes = addr;
4298 while (!ret && total_ram_bytes) {
4299 RAMBlock *block;
4300 char id[256];
4301 ram_addr_t length;
4302
4303 len = qemu_get_byte(f);
4304 qemu_get_buffer(f, (uint8_t *)id, len);
4305 id[len] = 0;
4306 length = qemu_get_be64(f);
4307
4308 block = qemu_ram_block_by_name(id);
4309 if (block && !qemu_ram_is_migratable(block)) {
4310 error_report("block %s should not be migrated !", id);
4311 ret = -EINVAL;
4312 } else if (block) {
4313 if (length != block->used_length) {
4314 Error *local_err = NULL;
4315
4316 ret = qemu_ram_resize(block, length,
4317 &local_err);
4318 if (local_err) {
4319 error_report_err(local_err);
4320 }
4321 }
4322 /* For postcopy we need to check hugepage sizes match */
4323 if (postcopy_advised && migrate_postcopy_ram() &&
4324 block->page_size != qemu_host_page_size) {
4325 uint64_t remote_page_size = qemu_get_be64(f);
4326 if (remote_page_size != block->page_size) {
4327 error_report("Mismatched RAM page size %s "
4328 "(local) %zd != %" PRId64,
4329 id, block->page_size,
4330 remote_page_size);
4331 ret = -EINVAL;
4332 }
4333 }
4334 if (migrate_ignore_shared()) {
4335 hwaddr addr = qemu_get_be64(f);
4336 if (ramblock_is_ignored(block) &&
4337 block->mr->addr != addr) {
4338 error_report("Mismatched GPAs for block %s "
4339 "%" PRId64 "!= %" PRId64,
4340 id, (uint64_t)addr,
4341 (uint64_t)block->mr->addr);
4342 ret = -EINVAL;
4343 }
4344 }
4345 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4346 block->idstr);
4347 } else {
4348 error_report("Unknown ramblock \"%s\", cannot "
4349 "accept migration", id);
4350 ret = -EINVAL;
4351 }
4352
4353 total_ram_bytes -= length;
4354 }
4355 break;
4356
4357 case RAM_SAVE_FLAG_ZERO:
4358 ch = qemu_get_byte(f);
4359 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4360 break;
4361
4362 case RAM_SAVE_FLAG_PAGE:
4363 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4364 break;
4365
4366 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4367 len = qemu_get_be32(f);
4368 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4369 error_report("Invalid compressed data length: %d", len);
4370 ret = -EINVAL;
4371 break;
4372 }
4373 decompress_data_with_multi_threads(f, host, len);
4374 break;
4375
4376 case RAM_SAVE_FLAG_XBZRLE:
4377 if (load_xbzrle(f, addr, host) < 0) {
4378 error_report("Failed to decompress XBZRLE page at "
4379 RAM_ADDR_FMT, addr);
4380 ret = -EINVAL;
4381 break;
4382 }
4383 break;
4384 case RAM_SAVE_FLAG_EOS:
4385 /* normal exit */
4386 multifd_recv_sync_main();
4387 break;
4388 default:
4389 if (flags & RAM_SAVE_FLAG_HOOK) {
4390 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4391 } else {
4392 error_report("Unknown combination of migration flags: 0x%x",
4393 flags);
4394 ret = -EINVAL;
4395 }
4396 }
4397 if (!ret) {
4398 ret = qemu_file_get_error(f);
4399 }
4400 if (!ret && host_bak) {
4401 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4402 }
4403 }
4404
4405 ret |= wait_for_decompress_done();
4406 return ret;
4407 }
4408
4409 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4410 {
4411 int ret = 0;
4412 static uint64_t seq_iter;
4413 /*
4414 * If system is running in postcopy mode, page inserts to host memory must
4415 * be atomic
4416 */
4417 bool postcopy_running = postcopy_is_running();
4418
4419 seq_iter++;
4420
4421 if (version_id != 4) {
4422 return -EINVAL;
4423 }
4424
4425 /*
4426 * This RCU critical section can be very long running.
4427 * When RCU reclaims in the code start to become numerous,
4428 * it will be necessary to reduce the granularity of this
4429 * critical section.
4430 */
4431 WITH_RCU_READ_LOCK_GUARD() {
4432 if (postcopy_running) {
4433 /*
4434 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4435 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4436 * service fast page faults.
4437 */
4438 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4439 } else {
4440 ret = ram_load_precopy(f);
4441 }
4442 }
4443 trace_ram_load_complete(ret, seq_iter);
4444
4445 return ret;
4446 }
4447
4448 static bool ram_has_postcopy(void *opaque)
4449 {
4450 RAMBlock *rb;
4451 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4452 if (ramblock_is_pmem(rb)) {
4453 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4454 "is not supported now!", rb->idstr, rb->host);
4455 return false;
4456 }
4457 }
4458
4459 return migrate_postcopy_ram();
4460 }
4461
4462 /* Sync all the dirty bitmap with destination VM. */
4463 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4464 {
4465 RAMBlock *block;
4466 QEMUFile *file = s->to_dst_file;
4467 int ramblock_count = 0;
4468
4469 trace_ram_dirty_bitmap_sync_start();
4470
4471 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4472 qemu_savevm_send_recv_bitmap(file, block->idstr);
4473 trace_ram_dirty_bitmap_request(block->idstr);
4474 ramblock_count++;
4475 }
4476
4477 trace_ram_dirty_bitmap_sync_wait();
4478
4479 /* Wait until all the ramblocks' dirty bitmap synced */
4480 while (ramblock_count--) {
4481 qemu_sem_wait(&s->rp_state.rp_sem);
4482 }
4483
4484 trace_ram_dirty_bitmap_sync_complete();
4485
4486 return 0;
4487 }
4488
4489 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4490 {
4491 qemu_sem_post(&s->rp_state.rp_sem);
4492 }
4493
4494 /*
4495 * Read the received bitmap, revert it as the initial dirty bitmap.
4496 * This is only used when the postcopy migration is paused but wants
4497 * to resume from a middle point.
4498 */
4499 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4500 {
4501 int ret = -EINVAL;
4502 /* from_dst_file is always valid because we're within rp_thread */
4503 QEMUFile *file = s->rp_state.from_dst_file;
4504 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4505 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4506 uint64_t size, end_mark;
4507
4508 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4509
4510 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4511 error_report("%s: incorrect state %s", __func__,
4512 MigrationStatus_str(s->state));
4513 return -EINVAL;
4514 }
4515
4516 /*
4517 * Note: see comments in ramblock_recv_bitmap_send() on why we
4518 * need the endianness conversion, and the paddings.
4519 */
4520 local_size = ROUND_UP(local_size, 8);
4521
4522 /* Add paddings */
4523 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4524
4525 size = qemu_get_be64(file);
4526
4527 /* The size of the bitmap should match with our ramblock */
4528 if (size != local_size) {
4529 error_report("%s: ramblock '%s' bitmap size mismatch "
4530 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4531 block->idstr, size, local_size);
4532 ret = -EINVAL;
4533 goto out;
4534 }
4535
4536 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4537 end_mark = qemu_get_be64(file);
4538
4539 ret = qemu_file_get_error(file);
4540 if (ret || size != local_size) {
4541 error_report("%s: read bitmap failed for ramblock '%s': %d"
4542 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4543 __func__, block->idstr, ret, local_size, size);
4544 ret = -EIO;
4545 goto out;
4546 }
4547
4548 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4549 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4550 __func__, block->idstr, end_mark);
4551 ret = -EINVAL;
4552 goto out;
4553 }
4554
4555 /*
4556 * Endianness conversion. We are during postcopy (though paused).
4557 * The dirty bitmap won't change. We can directly modify it.
4558 */
4559 bitmap_from_le(block->bmap, le_bitmap, nbits);
4560
4561 /*
4562 * What we received is "received bitmap". Revert it as the initial
4563 * dirty bitmap for this ramblock.
4564 */
4565 bitmap_complement(block->bmap, block->bmap, nbits);
4566
4567 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4568 ramblock_dirty_bitmap_clear_discarded_pages(block);
4569
4570 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4571 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4572
4573 /*
4574 * We succeeded to sync bitmap for current ramblock. If this is
4575 * the last one to sync, we need to notify the main send thread.
4576 */
4577 ram_dirty_bitmap_reload_notify(s);
4578
4579 ret = 0;
4580 out:
4581 g_free(le_bitmap);
4582 return ret;
4583 }
4584
4585 static int ram_resume_prepare(MigrationState *s, void *opaque)
4586 {
4587 RAMState *rs = *(RAMState **)opaque;
4588 int ret;
4589
4590 ret = ram_dirty_bitmap_sync_all(s, rs);
4591 if (ret) {
4592 return ret;
4593 }
4594
4595 ram_state_resume_prepare(rs, s->to_dst_file);
4596
4597 return 0;
4598 }
4599
4600 void postcopy_preempt_shutdown_file(MigrationState *s)
4601 {
4602 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4603 qemu_fflush(s->postcopy_qemufile_src);
4604 }
4605
4606 static SaveVMHandlers savevm_ram_handlers = {
4607 .save_setup = ram_save_setup,
4608 .save_live_iterate = ram_save_iterate,
4609 .save_live_complete_postcopy = ram_save_complete,
4610 .save_live_complete_precopy = ram_save_complete,
4611 .has_postcopy = ram_has_postcopy,
4612 .save_live_pending = ram_save_pending,
4613 .load_state = ram_load,
4614 .save_cleanup = ram_save_cleanup,
4615 .load_setup = ram_load_setup,
4616 .load_cleanup = ram_load_cleanup,
4617 .resume_prepare = ram_resume_prepare,
4618 };
4619
4620 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4621 size_t old_size, size_t new_size)
4622 {
4623 PostcopyState ps = postcopy_state_get();
4624 ram_addr_t offset;
4625 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4626 Error *err = NULL;
4627
4628 if (ramblock_is_ignored(rb)) {
4629 return;
4630 }
4631
4632 if (!migration_is_idle()) {
4633 /*
4634 * Precopy code on the source cannot deal with the size of RAM blocks
4635 * changing at random points in time - especially after sending the
4636 * RAM block sizes in the migration stream, they must no longer change.
4637 * Abort and indicate a proper reason.
4638 */
4639 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4640 migration_cancel(err);
4641 error_free(err);
4642 }
4643
4644 switch (ps) {
4645 case POSTCOPY_INCOMING_ADVISE:
4646 /*
4647 * Update what ram_postcopy_incoming_init()->init_range() does at the
4648 * time postcopy was advised. Syncing RAM blocks with the source will
4649 * result in RAM resizes.
4650 */
4651 if (old_size < new_size) {
4652 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4653 error_report("RAM block '%s' discard of resized RAM failed",
4654 rb->idstr);
4655 }
4656 }
4657 rb->postcopy_length = new_size;
4658 break;
4659 case POSTCOPY_INCOMING_NONE:
4660 case POSTCOPY_INCOMING_RUNNING:
4661 case POSTCOPY_INCOMING_END:
4662 /*
4663 * Once our guest is running, postcopy does no longer care about
4664 * resizes. When growing, the new memory was not available on the
4665 * source, no handler needed.
4666 */
4667 break;
4668 default:
4669 error_report("RAM block '%s' resized during postcopy state: %d",
4670 rb->idstr, ps);
4671 exit(-1);
4672 }
4673 }
4674
4675 static RAMBlockNotifier ram_mig_ram_notifier = {
4676 .ram_block_resized = ram_mig_ram_block_resized,
4677 };
4678
4679 void ram_mig_init(void)
4680 {
4681 qemu_mutex_init(&XBZRLE.lock);
4682 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4683 ram_block_notifier_add(&ram_mig_ram_notifier);
4684 }