migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62
63 /***********************************************************/
64 /* ram save/restore */
65
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char. We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70  */
71
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
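/*
 * Illustrative note (numbers are examples, not part of the stream
 * definition): these flags travel in the low bits of the page offset
 * written by save_page_header().  Offsets are TARGET_PAGE_SIZE aligned,
 * so the low bits are free.  E.g. a normal page at offset 0x2000 of the
 * block we already announced would be sent as the single be64 value
 * 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE = 0x2028.
 */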
81
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84 return buffer_is_zero(p, size);
85 }
86
87 XBZRLECacheStats xbzrle_counters;
88
89 /* struct contains the XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
104
105 static void XBZRLE_cache_lock(void)
106 {
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
109 }
110 }
111
112 static void XBZRLE_cache_unlock(void)
113 {
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
116 }
117 }
118
119 /**
120 * xbzrle_cache_resize: resize the xbzrle cache
121 *
122 * This function is called from migrate_params_apply in the main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
126 *
127 * Returns 0 for success or -1 for error
128 *
129 * @new_size: new cache size
130 * @errp: set *errp to the reason if the check failed
131 */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134 PageCache *new_cache;
135 int64_t ret = 0;
136
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
142 }
143
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
147 }
148
149 XBZRLE_cache_lock();
150
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
156 }
157
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
160 }
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
164 }
165
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171
172 #undef RAMBLOCK_FOREACH
173
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176 RAMBlock *block;
177 int ret = 0;
178
179 RCU_READ_LOCK_GUARD();
180
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
185 }
186 }
187 return ret;
188 }
189
190 static void ramblock_recv_map_init(void)
191 {
192 RAMBlock *rb;
193
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 }
198 }
199
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
204 }
205
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
218 {
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
222 }
223
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
225
226 /*
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228 *
229 * Returns >0 if success with sent bytes, or <0 if error.
230 */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
233 {
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
237
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
241 }
242
243 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
244
245 /*
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see the comment
248 * below). So extend it a bit beforehand.
249 */
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251
252 /*
253 * Always use little endian when sending the bitmap. This is
254 * required so it still works when source and destination VMs do
255 * not use the same endianness. (Note: big endian won't work.)
256 */
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
261
262 /*
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
267 */
268 size = ROUND_UP(size, 8);
269
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272 /*
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
275 */
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
278
279 g_free(le_bitmap);
280
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
283 }
284
285 return size + sizeof(size);
286 }
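/*
 * Illustrative wire layout produced by the function above, assuming a
 * 1 GiB block with 4 KiB target pages (example numbers only):
 *
 *   be64 size        = 32768   (262144 bits, rounded up to 8 bytes)
 *   32768 bytes      little-endian receivedmap
 *   be64 end marker  = 0x0123456789abcdef
 *
 * The returned byte count covers only the size field plus the bitmap,
 * not the trailing end marker.
 */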
287
288 /*
289 * An outstanding page request, on the source, having been received
290 * and queued
291 */
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
296
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
331
332 /* compression statistics since the beginning of the period */
334 /* number of times there was no free thread to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* amount of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
339
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355
356 static RAMState *ram_state;
357
358 static NotifierWithReturnList precopy_notifier_list;
359
360 void precopy_infrastructure_init(void)
361 {
362 notifier_with_return_list_init(&precopy_notifier_list);
363 }
364
365 void precopy_add_notifier(NotifierWithReturn *n)
366 {
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368 }
369
370 void precopy_remove_notifier(NotifierWithReturn *n)
371 {
372 notifier_with_return_remove(n);
373 }
374
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376 {
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 }
383
384 uint64_t ram_bytes_remaining(void)
385 {
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
387 0;
388 }
389
390 MigrationStats ram_counters;
391
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
400 };
401 typedef struct PageSearchStatus PageSearchStatus;
402
403 CompressionStats compression_counters;
404
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
414
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
418 };
419 typedef struct CompressParam CompressParam;
420
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
430 };
431 typedef struct DecompressParam DecompressParam;
432
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used together with comp_done_cond.
438 */
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
443
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
449
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
452
453 static void *do_data_compress(void *opaque)
454 {
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
459
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
467
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
470
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
476
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
480 }
481 }
482 qemu_mutex_unlock(&param->mutex);
483
484 return NULL;
485 }
486
487 static void compress_threads_save_cleanup(void)
488 {
489 int i, thread_count;
490
491 if (!migrate_use_compression() || !comp_param) {
492 return;
493 }
494
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
497 /*
498 * we use it as an indicator of whether the thread is
499 * properly initialized or not
500 */
501 if (!comp_param[i].file) {
502 break;
503 }
504
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
509
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
517 }
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
524 }
525
526 static int compress_threads_save_setup(void)
527 {
528 int i, thread_count;
529
530 if (!migrate_use_compression()) {
531 return 0;
532 }
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
542 }
543
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
548 }
549
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
552 */
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
561 }
562 return 0;
563
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
567 }
568
569 /**
570 * save_page_header: write page header to wire
571 *
572 * If this is the 1st block, it also writes the block identification
573 *
574 * Returns the number of bytes written
575 *
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * the lower bits contain flags
580 */
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
583 {
584 size_t size, len;
585
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
588 }
589 qemu_put_be64(f, offset);
590 size = 8;
591
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
598 }
599 return size;
600 }
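/*
 * Illustrative header layouts (the block name is only an example):
 *
 *   First page of block "pc.ram":
 *     be64(offset | flags)       8 bytes  (RAM_SAVE_FLAG_CONTINUE clear)
 *     byte  strlen("pc.ram") = 6 1 byte
 *     bytes "pc.ram"             6 bytes  -> size = 15
 *
 *   Subsequent pages of the same block:
 *     be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)  -> size = 8
 */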
601
602 /**
603 * mig_throttle_guest_down: throttle down the guest
604 *
605 * Reduce the amount of guest CPU execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
610 */
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
613 {
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
619
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
622
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by the guest, which would
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
637 }
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
639 }
640 }
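/*
 * Worked example for the tailslow path above, with illustrative numbers:
 * if the current throttle is 20% (cpu_now = 80) and the guest dirtied
 * twice as many bytes as the threshold, then cpu_ideal = 80 * 0.5 = 40,
 * so throttle_inc = MIN(80 - 40, pct_increment).  Assuming a
 * cpu-throttle-increment of 10, the throttle moves from 20% to 30%,
 * capped at max-cpu-throttle.
 */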
641
642 /**
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
644 *
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
647 *
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
653 */
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
655 {
656 if (!rs->xbzrle_enabled) {
657 return;
658 }
659
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
664 }
665
666 #define ENCODING_FLAG_XBZRLE 0x1
667
668 /**
669 * save_xbzrle_page: compress and send current page
670 *
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
674 *
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
681 */
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
685 {
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
688
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
700 }
701 }
702 return -1;
703 }
704
705 /*
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
709 *
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
715 */
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
726
727 /*
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
730 */
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
733 /*
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
737 */
738 *current_data = prev_cached_page;
739 }
740
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
749 }
750
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
758 /*
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
762 */
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
765
766 return 1;
767 }
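/*
 * Illustrative accounting for the XBZRLE path above (example numbers):
 * for encoded_len = 100 with RAM_SAVE_FLAG_CONTINUE set, the wire cost
 * is 8 (header) + 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length) + 100 =
 * 111 bytes; ram_counters.transferred grows by 111, while
 * xbzrle_counters.bytes grows by 111 - 8 = 103.
 */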
768
769 /**
770 * migration_bitmap_find_dirty: find the next dirty page from start
771 *
772 * Returns the page offset within memory region of the start of a dirty page
773 *
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
777 */
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
781 {
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
784
785 if (ramblock_is_ignored(rb)) {
786 return size;
787 }
788
789 return find_next_bit(bitmap, size, start);
790 }
791
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
795 {
796 bool ret;
797
798 /*
799 * Clear the dirty bitmap if needed. This _must_ be called before we
800 * send any of the pages in the chunk, because we need to make sure
801 * we can capture further page content changes when we sync the dirty
802 * log the next time. So as long as we are going to send any of
803 * the pages in the chunk, we clear the remote dirty bitmap for all
804 * of them. Clearing it earlier won't be a problem, but too late will.
805 */
806 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
807 uint8_t shift = rb->clear_bmap_shift;
808 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
809 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
810
811 /*
812 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It
813 * can make things easier since the start address of the
814 * small chunk will always be aligned to 64 pages, so the
815 * bitmap will always be aligned to unsigned long. We should
816 * even be able to remove this restriction, but it is simply
817 * kept here.
818 */
819 assert(shift >= 6);
820 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
821 memory_region_clear_dirty_bitmap(rb->mr, start, size);
822 }
823
824 ret = test_and_clear_bit(page, rb->bmap);
825
826 if (ret) {
827 rs->migration_dirty_pages--;
828 }
829
830 return ret;
831 }
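/*
 * Illustrative example of the clear_bmap chunking above (the shift value
 * is just an example): with 4 KiB target pages and clear_bmap_shift = 18,
 * size = 1 << (12 + 18) = 1 GiB and start is rounded down to the 1 GiB
 * boundary containing the page, so the first page sent from each 1 GiB
 * chunk triggers a single memory_region_clear_dirty_bitmap() call
 * covering the whole chunk.
 */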
832
833 /* Called with RCU critical section */
834 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
835 {
836 uint64_t new_dirty_pages =
837 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
838
839 rs->migration_dirty_pages += new_dirty_pages;
840 rs->num_dirty_pages_period += new_dirty_pages;
841 }
842
843 /**
844 * ram_pagesize_summary: calculate all the pagesizes of a VM
845 *
846 * Returns a summary bitmap of the page sizes of all RAMBlocks
847 *
848 * For VMs with just normal pages this is equivalent to the host page
849 * size. If it has some huge pages, then it is the OR of all the
850 * different page sizes.
851 */
852 uint64_t ram_pagesize_summary(void)
853 {
854 RAMBlock *block;
855 uint64_t summary = 0;
856
857 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
858 summary |= block->page_size;
859 }
860
861 return summary;
862 }
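/*
 * Example (illustrative sizes): a VM backed only by 4 KiB pages yields
 * a summary of 0x1000; adding a RAMBlock backed by 2 MiB hugepages
 * yields 0x1000 | 0x200000 = 0x201000, since every page size is a
 * power of two.
 */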
863
864 uint64_t ram_get_total_transferred_pages(void)
865 {
866 return ram_counters.normal + ram_counters.duplicate +
867 compression_counters.pages + xbzrle_counters.pages;
868 }
869
870 static void migration_update_rates(RAMState *rs, int64_t end_time)
871 {
872 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
873 double compressed_size;
874
875 /* calculate period counters */
876 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
877 / (end_time - rs->time_last_bitmap_sync);
878
879 if (!page_count) {
880 return;
881 }
882
883 if (migrate_use_xbzrle()) {
884 double encoded_size, unencoded_size;
885
886 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
887 rs->xbzrle_cache_miss_prev) / page_count;
888 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
889 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
890 TARGET_PAGE_SIZE;
891 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
892 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
893 xbzrle_counters.encoding_rate = 0;
894 } else {
895 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
896 }
897 rs->xbzrle_pages_prev = xbzrle_counters.pages;
898 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
899 }
900
901 if (migrate_use_compression()) {
902 compression_counters.busy_rate = (double)(compression_counters.busy -
903 rs->compress_thread_busy_prev) / page_count;
904 rs->compress_thread_busy_prev = compression_counters.busy;
905
906 compressed_size = compression_counters.compressed_size -
907 rs->compressed_size_prev;
908 if (compressed_size) {
909 double uncompressed_size = (compression_counters.pages -
910 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
911
912 /* Compression-Ratio = Uncompressed-size / Compressed-size */
913 compression_counters.compression_rate =
914 uncompressed_size / compressed_size;
915
916 rs->compress_pages_prev = compression_counters.pages;
917 rs->compressed_size_prev = compression_counters.compressed_size;
918 }
919 }
920 }
921
922 static void migration_trigger_throttle(RAMState *rs)
923 {
924 MigrationState *s = migrate_get_current();
925 uint64_t threshold = s->parameters.throttle_trigger_threshold;
926
927 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
928 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
929 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
930
931 /* During block migration the auto-converge logic incorrectly detects
932 * that ram migration makes no progress. Avoid this by disabling the
933 * throttling logic during the bulk phase of block migration. */
934 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
935 /* The following detection logic can be refined later. For now:
936 Check to see if the ratio between dirtied bytes and the approx.
937 amount of bytes that just got transferred since the last time
938 we were in this routine reaches the threshold. If that happens
939 twice, start or increase throttling. */
940
941 if ((bytes_dirty_period > bytes_dirty_threshold) &&
942 (++rs->dirty_rate_high_cnt >= 2)) {
943 trace_migration_throttle();
944 rs->dirty_rate_high_cnt = 0;
945 mig_throttle_guest_down(bytes_dirty_period,
946 bytes_dirty_threshold);
947 }
948 }
949 }
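/*
 * Worked example for the trigger above (illustrative numbers): with a
 * throttle-trigger-threshold of 50 and 1 GiB transferred during the
 * last period, bytes_dirty_threshold = 1 GiB * 50 / 100 = 512 MiB.  If
 * the guest dirtied more than that during two consecutive periods,
 * mig_throttle_guest_down() is invoked.
 */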
950
951 static void migration_bitmap_sync(RAMState *rs)
952 {
953 RAMBlock *block;
954 int64_t end_time;
955
956 ram_counters.dirty_sync_count++;
957
958 if (!rs->time_last_bitmap_sync) {
959 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
960 }
961
962 trace_migration_bitmap_sync_start();
963 memory_global_dirty_log_sync();
964
965 qemu_mutex_lock(&rs->bitmap_mutex);
966 WITH_RCU_READ_LOCK_GUARD() {
967 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
968 ramblock_sync_dirty_bitmap(rs, block);
969 }
970 ram_counters.remaining = ram_bytes_remaining();
971 }
972 qemu_mutex_unlock(&rs->bitmap_mutex);
973
974 memory_global_after_dirty_log_sync();
975 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
976
977 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
978
979 /* more than 1 second = 1000 milliseconds */
980 if (end_time > rs->time_last_bitmap_sync + 1000) {
981 migration_trigger_throttle(rs);
982
983 migration_update_rates(rs, end_time);
984
985 rs->target_page_count_prev = rs->target_page_count;
986
987 /* reset period counters */
988 rs->time_last_bitmap_sync = end_time;
989 rs->num_dirty_pages_period = 0;
990 rs->bytes_xfer_prev = ram_counters.transferred;
991 }
992 if (migrate_use_events()) {
993 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
994 }
995 }
996
997 static void migration_bitmap_sync_precopy(RAMState *rs)
998 {
999 Error *local_err = NULL;
1000
1001 /*
1002 * The current notifier usage is just an optimization for migration, so we
1003 * don't stop the normal migration process in the error case.
1004 */
1005 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1006 error_report_err(local_err);
1007 local_err = NULL;
1008 }
1009
1010 migration_bitmap_sync(rs);
1011
1012 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1013 error_report_err(local_err);
1014 }
1015 }
1016
1017 /**
1018 * save_zero_page_to_file: send the zero page to the file
1019 *
1020 * Returns the size of data written to the file, 0 means the page is not
1021 * a zero page
1022 *
1023 * @rs: current RAM state
1024 * @file: the file where the data is saved
1025 * @block: block that contains the page we want to send
1026 * @offset: offset inside the block for the page
1027 */
1028 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1029 RAMBlock *block, ram_addr_t offset)
1030 {
1031 uint8_t *p = block->host + offset;
1032 int len = 0;
1033
1034 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1035 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1036 qemu_put_byte(file, 0);
1037 len += 1;
1038 }
1039 return len;
1040 }
1041
1042 /**
1043 * save_zero_page: send the zero page to the stream
1044 *
1045 * Returns the number of pages written.
1046 *
1047 * @rs: current RAM state
1048 * @block: block that contains the page we want to send
1049 * @offset: offset inside the block for the page
1050 */
1051 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1052 {
1053 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1054
1055 if (len) {
1056 ram_counters.duplicate++;
1057 ram_counters.transferred += len;
1058 return 1;
1059 }
1060 return -1;
1061 }
1062
1063 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1064 {
1065 if (!migrate_release_ram() || !migration_in_postcopy()) {
1066 return;
1067 }
1068
1069 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1070 }
1071
1072 /*
1073 * @pages: the number of pages written by the control path,
1074 * < 0 - error
1075 * > 0 - number of pages written
1076 *
1077 * Return true if the page has been saved, otherwise false is returned.
1078 */
1079 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1080 int *pages)
1081 {
1082 uint64_t bytes_xmit = 0;
1083 int ret;
1084
1085 *pages = -1;
1086 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1087 &bytes_xmit);
1088 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1089 return false;
1090 }
1091
1092 if (bytes_xmit) {
1093 ram_counters.transferred += bytes_xmit;
1094 *pages = 1;
1095 }
1096
1097 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1098 return true;
1099 }
1100
1101 if (bytes_xmit > 0) {
1102 ram_counters.normal++;
1103 } else if (bytes_xmit == 0) {
1104 ram_counters.duplicate++;
1105 }
1106
1107 return true;
1108 }
1109
1110 /*
1111 * directly send the page to the stream
1112 *
1113 * Returns the number of pages written.
1114 *
1115 * @rs: current RAM state
1116 * @block: block that contains the page we want to send
1117 * @offset: offset inside the block for the page
1118 * @buf: the page to be sent
1119 * @async: send the page asynchronously
1120 */
1121 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1122 uint8_t *buf, bool async)
1123 {
1124 ram_counters.transferred += save_page_header(rs, rs->f, block,
1125 offset | RAM_SAVE_FLAG_PAGE);
1126 if (async) {
1127 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1128 migrate_release_ram() &
1129 migration_in_postcopy());
1130 } else {
1131 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1132 }
1133 ram_counters.transferred += TARGET_PAGE_SIZE;
1134 ram_counters.normal++;
1135 return 1;
1136 }
1137
1138 /**
1139 * ram_save_page: send the given page to the stream
1140 *
1141 * Returns the number of pages written.
1142 * < 0 - error
1143 * >=0 - Number of pages written - this might legally be 0
1144 * if xbzrle noticed the page was the same.
1145 *
1146 * @rs: current RAM state
1147 * @pss: data about the page we want to send
1149 * @last_stage: if we are at the completion stage
1150 */
1151 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1152 {
1153 int pages = -1;
1154 uint8_t *p;
1155 bool send_async = true;
1156 RAMBlock *block = pss->block;
1157 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1158 ram_addr_t current_addr = block->offset + offset;
1159
1160 p = block->host + offset;
1161 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1162
1163 XBZRLE_cache_lock();
1164 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1165 pages = save_xbzrle_page(rs, &p, current_addr, block,
1166 offset, last_stage);
1167 if (!last_stage) {
1168 /* Can't send this cached data async, since the cache page
1169 * might get updated before it gets to the wire
1170 */
1171 send_async = false;
1172 }
1173 }
1174
1175 /* XBZRLE overflow or normal page */
1176 if (pages == -1) {
1177 pages = save_normal_page(rs, block, offset, p, send_async);
1178 }
1179
1180 XBZRLE_cache_unlock();
1181
1182 return pages;
1183 }
1184
1185 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1186 ram_addr_t offset)
1187 {
1188 if (multifd_queue_page(rs->f, block, offset) < 0) {
1189 return -1;
1190 }
1191 ram_counters.normal++;
1192
1193 return 1;
1194 }
1195
1196 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1197 ram_addr_t offset, uint8_t *source_buf)
1198 {
1199 RAMState *rs = ram_state;
1200 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1201 bool zero_page = false;
1202 int ret;
1203
1204 if (save_zero_page_to_file(rs, f, block, offset)) {
1205 zero_page = true;
1206 goto exit;
1207 }
1208
1209 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1210
1211 /*
1212 * copy it to an internal buffer to avoid it being modified by the VM,
1213 * so that we can catch errors during compression and
1214 * decompression
1215 */
1216 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1217 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1218 if (ret < 0) {
1219 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1220 error_report("compressed data failed!");
1221 return false;
1222 }
1223
1224 exit:
1225 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1226 return zero_page;
1227 }
1228
1229 static void
1230 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1231 {
1232 ram_counters.transferred += bytes_xmit;
1233
1234 if (param->zero_page) {
1235 ram_counters.duplicate++;
1236 return;
1237 }
1238
1239 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1240 compression_counters.compressed_size += bytes_xmit - 8;
1241 compression_counters.pages++;
1242 }
1243
1244 static bool save_page_use_compression(RAMState *rs);
1245
1246 static void flush_compressed_data(RAMState *rs)
1247 {
1248 int idx, len, thread_count;
1249
1250 if (!save_page_use_compression(rs)) {
1251 return;
1252 }
1253 thread_count = migrate_compress_threads();
1254
1255 qemu_mutex_lock(&comp_done_lock);
1256 for (idx = 0; idx < thread_count; idx++) {
1257 while (!comp_param[idx].done) {
1258 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1259 }
1260 }
1261 qemu_mutex_unlock(&comp_done_lock);
1262
1263 for (idx = 0; idx < thread_count; idx++) {
1264 qemu_mutex_lock(&comp_param[idx].mutex);
1265 if (!comp_param[idx].quit) {
1266 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1267 /*
1268 * it's safe to fetch zero_page without holding comp_done_lock
1269 * as there is no further request submitted to the thread,
1270 * i.e., the thread should be waiting for a request at this point.
1271 */
1272 update_compress_thread_counts(&comp_param[idx], len);
1273 }
1274 qemu_mutex_unlock(&comp_param[idx].mutex);
1275 }
1276 }
1277
1278 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1279 ram_addr_t offset)
1280 {
1281 param->block = block;
1282 param->offset = offset;
1283 }
1284
1285 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1286 ram_addr_t offset)
1287 {
1288 int idx, thread_count, bytes_xmit = -1, pages = -1;
1289 bool wait = migrate_compress_wait_thread();
1290
1291 thread_count = migrate_compress_threads();
1292 qemu_mutex_lock(&comp_done_lock);
1293 retry:
1294 for (idx = 0; idx < thread_count; idx++) {
1295 if (comp_param[idx].done) {
1296 comp_param[idx].done = false;
1297 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1298 qemu_mutex_lock(&comp_param[idx].mutex);
1299 set_compress_params(&comp_param[idx], block, offset);
1300 qemu_cond_signal(&comp_param[idx].cond);
1301 qemu_mutex_unlock(&comp_param[idx].mutex);
1302 pages = 1;
1303 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1304 break;
1305 }
1306 }
1307
1308 /*
1309 * wait for the free thread if the user specifies 'compress-wait-thread',
1310 * otherwise we will post the page out in the main thread as a normal page.
1311 */
1312 if (pages < 0 && wait) {
1313 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1314 goto retry;
1315 }
1316 qemu_mutex_unlock(&comp_done_lock);
1317
1318 return pages;
1319 }
1320
1321 /**
1322 * find_dirty_block: find the next dirty page and update any state
1323 * associated with the search process.
1324 *
1325 * Returns true if a page is found
1326 *
1327 * @rs: current RAM state
1328 * @pss: data about the state of the current dirty page scan
1329 * @again: set to false if the search has scanned the whole of RAM
1330 */
1331 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1332 {
1333 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1334 if (pss->complete_round && pss->block == rs->last_seen_block &&
1335 pss->page >= rs->last_page) {
1336 /*
1337 * We've been once around the RAM and haven't found anything.
1338 * Give up.
1339 */
1340 *again = false;
1341 return false;
1342 }
1343 if (!offset_in_ramblock(pss->block,
1344 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1345 /* Didn't find anything in this RAM Block */
1346 pss->page = 0;
1347 pss->block = QLIST_NEXT_RCU(pss->block, next);
1348 if (!pss->block) {
1349 /*
1350 * If memory migration starts over, we will meet a dirtied page
1351 * which may still exist in the compression threads' ring, so we
1352 * should flush the compressed data to make sure the new page
1353 * is not overwritten by the old one in the destination.
1354 *
1355 * Also, if xbzrle is on, stop using the data compression at this
1356 * point. In theory, xbzrle can do better than compression.
1357 */
1358 flush_compressed_data(rs);
1359
1360 /* Hit the end of the list */
1361 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1362 /* Flag that we've looped */
1363 pss->complete_round = true;
1364 /* After the first round, enable XBZRLE. */
1365 if (migrate_use_xbzrle()) {
1366 rs->xbzrle_enabled = true;
1367 }
1368 }
1369 /* Didn't find anything this time, but try again on the new block */
1370 *again = true;
1371 return false;
1372 } else {
1373 /* Can go around again, but... */
1374 *again = true;
1375 /* We've found something so probably don't need to */
1376 return true;
1377 }
1378 }
1379
1380 /**
1381 * unqueue_page: gets a page off the queue
1382 *
1383 * Helper for 'get_queued_page' - gets a page off the queue
1384 *
1385 * Returns the block of the page (or NULL if none available)
1386 *
1387 * @rs: current RAM state
1388 * @offset: used to return the offset within the RAMBlock
1389 */
1390 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1391 {
1392 RAMBlock *block = NULL;
1393
1394 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1395 return NULL;
1396 }
1397
1398 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1399 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1400 struct RAMSrcPageRequest *entry =
1401 QSIMPLEQ_FIRST(&rs->src_page_requests);
1402 block = entry->rb;
1403 *offset = entry->offset;
1404
1405 if (entry->len > TARGET_PAGE_SIZE) {
1406 entry->len -= TARGET_PAGE_SIZE;
1407 entry->offset += TARGET_PAGE_SIZE;
1408 } else {
1409 memory_region_unref(block->mr);
1410 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1411 g_free(entry);
1412 migration_consume_urgent_request();
1413 }
1414 }
1415
1416 return block;
1417 }
1418
1419 #if defined(__linux__)
1420 /**
1421 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1422 * is found, return RAM block pointer and page offset
1423 *
1424 * Returns pointer to the RAMBlock containing faulting page,
1425 * NULL if no write faults are pending
1426 *
1427 * @rs: current RAM state
1428 * @offset: page offset from the beginning of the block
1429 */
1430 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1431 {
1432 struct uffd_msg uffd_msg;
1433 void *page_address;
1434 RAMBlock *block;
1435 int res;
1436
1437 if (!migrate_background_snapshot()) {
1438 return NULL;
1439 }
1440
1441 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1442 if (res <= 0) {
1443 return NULL;
1444 }
1445
1446 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1447 block = qemu_ram_block_from_host(page_address, false, offset);
1448 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1449 return block;
1450 }
1451
1452 /**
1453 * ram_save_release_protection: release UFFD write protection after
1454 * a range of pages has been saved
1455 *
1456 * @rs: current RAM state
1457 * @pss: page-search-status structure
1458 * @start_page: index of the first page in the range relative to pss->block
1459 *
1460 * Returns 0 on success, negative value in case of an error
1461 */
1462 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1463 unsigned long start_page)
1464 {
1465 int res = 0;
1466
1467 /* Check if page is from UFFD-managed region. */
1468 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1469 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1470 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1471
1472 /* Flush async buffers before un-protect. */
1473 qemu_fflush(rs->f);
1474 /* Un-protect memory range. */
1475 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1476 false, false);
1477 }
1478
1479 return res;
1480 }
1481
1482 /* ram_write_tracking_available: check if kernel supports required UFFD features
1483 *
1484 * Returns true if it does, false otherwise
1485 */
1486 bool ram_write_tracking_available(void)
1487 {
1488 uint64_t uffd_features;
1489 int res;
1490
1491 res = uffd_query_features(&uffd_features);
1492 return (res == 0 &&
1493 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1494 }
1495
1496 /* ram_write_tracking_compatible: check if guest configuration is
1497 * compatible with 'write-tracking'
1498 *
1499 * Returns true if compatible, false otherwise
1500 */
1501 bool ram_write_tracking_compatible(void)
1502 {
1503 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1504 int uffd_fd;
1505 RAMBlock *block;
1506 bool ret = false;
1507
1508 /* Open UFFD file descriptor */
1509 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1510 if (uffd_fd < 0) {
1511 return false;
1512 }
1513
1514 RCU_READ_LOCK_GUARD();
1515
1516 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1517 uint64_t uffd_ioctls;
1518
1519 /* Nothing to do with read-only and MMIO-writable regions */
1520 if (block->mr->readonly || block->mr->rom_device) {
1521 continue;
1522 }
1523 /* Try to register block memory via UFFD-IO to track writes */
1524 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1525 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1526 goto out;
1527 }
1528 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1529 goto out;
1530 }
1531 }
1532 ret = true;
1533
1534 out:
1535 uffd_close_fd(uffd_fd);
1536 return ret;
1537 }
1538
1539 /*
1540 * ram_block_populate_pages: populate memory in the RAM block by reading
1541 * a byte from the beginning of each page.
1542 *
1543 * Since it's solely used for userfault_fd WP feature, here we just
1544 * hardcode page size to qemu_real_host_page_size.
1545 *
1546 * @block: RAM block to populate
1547 */
1548 static void ram_block_populate_pages(RAMBlock *block)
1549 {
1550 char *ptr = (char *) block->host;
1551
1552 for (ram_addr_t offset = 0; offset < block->used_length;
1553 offset += qemu_real_host_page_size) {
1554 char tmp = *(ptr + offset);
1555
1556 /* Don't optimize the read out */
1557 asm volatile("" : "+r" (tmp));
1558 }
1559 }
1560
1561 /*
1562 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1563 */
1564 void ram_write_tracking_prepare(void)
1565 {
1566 RAMBlock *block;
1567
1568 RCU_READ_LOCK_GUARD();
1569
1570 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1571 /* Nothing to do with read-only and MMIO-writable regions */
1572 if (block->mr->readonly || block->mr->rom_device) {
1573 continue;
1574 }
1575
1576 /*
1577 * Populate pages of the RAM block before enabling userfault_fd
1578 * write protection.
1579 *
1580 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1581 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1582 * pages with pte_none() entries in page table.
1583 */
1584 ram_block_populate_pages(block);
1585 }
1586 }
1587
1588 /*
1589 * ram_write_tracking_start: start UFFD-WP memory tracking
1590 *
1591 * Returns 0 for success or negative value in case of error
1592 */
1593 int ram_write_tracking_start(void)
1594 {
1595 int uffd_fd;
1596 RAMState *rs = ram_state;
1597 RAMBlock *block;
1598
1599 /* Open UFFD file descriptor */
1600 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1601 if (uffd_fd < 0) {
1602 return uffd_fd;
1603 }
1604 rs->uffdio_fd = uffd_fd;
1605
1606 RCU_READ_LOCK_GUARD();
1607
1608 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1609 /* Nothing to do with read-only and MMIO-writable regions */
1610 if (block->mr->readonly || block->mr->rom_device) {
1611 continue;
1612 }
1613
1614 /* Register block memory with UFFD to track writes */
1615 if (uffd_register_memory(rs->uffdio_fd, block->host,
1616 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1617 goto fail;
1618 }
1619 /* Apply UFFD write protection to the block memory range */
1620 if (uffd_change_protection(rs->uffdio_fd, block->host,
1621 block->max_length, true, false)) {
1622 goto fail;
1623 }
1624 block->flags |= RAM_UF_WRITEPROTECT;
1625 memory_region_ref(block->mr);
1626
1627 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1628 block->host, block->max_length);
1629 }
1630
1631 return 0;
1632
1633 fail:
1634 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1635
1636 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1637 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1638 continue;
1639 }
1640 /*
1641 * In case some memory block failed to be write-protected
1642 * remove protection and unregister all succeeded RAM blocks
1643 */
1644 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1645 false, false);
1646 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1647 /* Cleanup flags and remove reference */
1648 block->flags &= ~RAM_UF_WRITEPROTECT;
1649 memory_region_unref(block->mr);
1650 }
1651
1652 uffd_close_fd(uffd_fd);
1653 rs->uffdio_fd = -1;
1654 return -1;
1655 }
1656
1657 /**
1658 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1659 */
1660 void ram_write_tracking_stop(void)
1661 {
1662 RAMState *rs = ram_state;
1663 RAMBlock *block;
1664
1665 RCU_READ_LOCK_GUARD();
1666
1667 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1668 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1669 continue;
1670 }
1671 /* Remove protection and unregister all affected RAM blocks */
1672 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1673 false, false);
1674 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1675
1676 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1677 block->host, block->max_length);
1678
1679 /* Cleanup flags and remove reference */
1680 block->flags &= ~RAM_UF_WRITEPROTECT;
1681 memory_region_unref(block->mr);
1682 }
1683
1684 /* Finally close UFFD file descriptor */
1685 uffd_close_fd(rs->uffdio_fd);
1686 rs->uffdio_fd = -1;
1687 }
1688
1689 #else
1690 /* No target OS support, stubs just fail or ignore */
1691
1692 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1693 {
1694 (void) rs;
1695 (void) offset;
1696
1697 return NULL;
1698 }
1699
1700 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1701 unsigned long start_page)
1702 {
1703 (void) rs;
1704 (void) pss;
1705 (void) start_page;
1706
1707 return 0;
1708 }
1709
1710 bool ram_write_tracking_available(void)
1711 {
1712 return false;
1713 }
1714
1715 bool ram_write_tracking_compatible(void)
1716 {
1717 assert(0);
1718 return false;
1719 }
1720
1721 int ram_write_tracking_start(void)
1722 {
1723 assert(0);
1724 return -1;
1725 }
1726
1727 void ram_write_tracking_stop(void)
1728 {
1729 assert(0);
1730 }
1731 #endif /* defined(__linux__) */
1732
1733 /**
1734 * get_queued_page: unqueue a page from the postcopy requests
1735 *
1736 * Skips pages that are already sent (!dirty)
1737 *
1738 * Returns true if a queued page is found
1739 *
1740 * @rs: current RAM state
1741 * @pss: data about the state of the current dirty page scan
1742 */
1743 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1744 {
1745 RAMBlock *block;
1746 ram_addr_t offset;
1747 bool dirty;
1748
1749 do {
1750 block = unqueue_page(rs, &offset);
1751 /*
1752 * We're sending this page, and since it's postcopy nothing else
1753 * will dirty it, and we must make sure it doesn't get sent again
1754 * even if this queue request was received after the background
1755 * search already sent it.
1756 */
1757 if (block) {
1758 unsigned long page;
1759
1760 page = offset >> TARGET_PAGE_BITS;
1761 dirty = test_bit(page, block->bmap);
1762 if (!dirty) {
1763 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1764 page);
1765 } else {
1766 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1767 }
1768 }
1769
1770 } while (block && !dirty);
1771
1772 if (!block) {
1773 /*
1774 * Poll write faults too if background snapshot is enabled; that's
1775 * when vcpus may be blocked by the write-protected pages.
1776 */
1777 block = poll_fault_page(rs, &offset);
1778 }
1779
1780 if (block) {
1781 /*
1782 * We want the background search to continue from the queued page
1783 * since the guest is likely to want other pages near to the page
1784 * it just requested.
1785 */
1786 pss->block = block;
1787 pss->page = offset >> TARGET_PAGE_BITS;
1788
1789 /*
1790 * This unqueued page would break the "one round" check, even if it
1791 * is really rare.
1792 */
1793 pss->complete_round = false;
1794 }
1795
1796 return !!block;
1797 }
1798
1799 /**
1800 * migration_page_queue_free: drop any remaining pages in the ram
1801 * request queue
1802 *
1803 * It should be empty at the end anyway, but in error cases there may
1804 * be some left. In case any pages are left, we drop them.
1805 *
1806 */
1807 static void migration_page_queue_free(RAMState *rs)
1808 {
1809 struct RAMSrcPageRequest *mspr, *next_mspr;
1810 /* This queue should generally be empty - but in the case of a failed
1811 * migration it might have some droppings in it.
1812 */
1813 RCU_READ_LOCK_GUARD();
1814 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1815 memory_region_unref(mspr->rb->mr);
1816 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1817 g_free(mspr);
1818 }
1819 }
1820
1821 /**
1822 * ram_save_queue_pages: queue the page for transmission
1823 *
1824 * A request from postcopy destination for example.
1825 *
1826 * Returns zero on success or negative on error
1827 *
1828 * @rbname: Name of the RAMBlock of the request. NULL means the
1829 * same as the last one.
1830 * @start: starting address from the start of the RAMBlock
1831 * @len: length (in bytes) to send
1832 */
1833 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1834 {
1835 RAMBlock *ramblock;
1836 RAMState *rs = ram_state;
1837
1838 ram_counters.postcopy_requests++;
1839 RCU_READ_LOCK_GUARD();
1840
1841 if (!rbname) {
1842 /* Reuse last RAMBlock */
1843 ramblock = rs->last_req_rb;
1844
1845 if (!ramblock) {
1846 /*
1847 * Shouldn't happen, we can't reuse the last RAMBlock if
1848 * it's the 1st request.
1849 */
1850 error_report("ram_save_queue_pages no previous block");
1851 return -1;
1852 }
1853 } else {
1854 ramblock = qemu_ram_block_by_name(rbname);
1855
1856 if (!ramblock) {
1857 /* We shouldn't be asked for a non-existent RAMBlock */
1858 error_report("ram_save_queue_pages no block '%s'", rbname);
1859 return -1;
1860 }
1861 rs->last_req_rb = ramblock;
1862 }
1863 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1864 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1865 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1866 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1867 __func__, start, len, ramblock->used_length);
1868 return -1;
1869 }
1870
1871 struct RAMSrcPageRequest *new_entry =
1872 g_malloc0(sizeof(struct RAMSrcPageRequest));
1873 new_entry->rb = ramblock;
1874 new_entry->offset = start;
1875 new_entry->len = len;
1876
1877 memory_region_ref(ramblock->mr);
1878 qemu_mutex_lock(&rs->src_page_req_mutex);
1879 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1880 migration_make_urgent_request();
1881 qemu_mutex_unlock(&rs->src_page_req_mutex);
1882
1883 return 0;
1884 }
1885
1886 static bool save_page_use_compression(RAMState *rs)
1887 {
1888 if (!migrate_use_compression()) {
1889 return false;
1890 }
1891
1892 /*
1893 * If xbzrle is enabled (e.g., after the first round of migration), stop
1894 * using data compression. In theory, xbzrle can do better than
1895 * compression.
1896 */
1897 if (rs->xbzrle_enabled) {
1898 return false;
1899 }
1900
1901 return true;
1902 }
1903
1904 /*
1905 * try to compress the page before posting it out, return true if the page
1906 * has been properly handled by compression, otherwise needs other
1907 * paths to handle it
1908 */
1909 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1910 {
1911 if (!save_page_use_compression(rs)) {
1912 return false;
1913 }
1914
1915 /*
1916 * When starting a new block, the first page of the block should be
1917 * sent out before other pages in the same block, and all the pages
1918 * in the last block should have been sent out already. Keeping this
1919 * order is important, because the 'cont' flag is used to avoid
1920 * resending the block name.
1921 *
1922 * We post the first page as a normal page as compression will take
1923 * a lot of CPU resources.
1924 */
1925 if (block != rs->last_sent_block) {
1926 flush_compressed_data(rs);
1927 return false;
1928 }
1929
1930 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1931 return true;
1932 }
1933
1934 compression_counters.busy++;
1935 return false;
1936 }
1937
1938 /**
1939 * ram_save_target_page: save one target page
1940 *
1941 * Returns the number of pages written
1942 *
1943 * @rs: current RAM state
1944 * @pss: data about the page we want to send
1945 * @last_stage: if we are at the completion stage
1946 */
1947 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1948 bool last_stage)
1949 {
1950 RAMBlock *block = pss->block;
1951 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1952 int res;
1953
1954 if (control_save_page(rs, block, offset, &res)) {
1955 return res;
1956 }
1957
1958 if (save_compress_page(rs, block, offset)) {
1959 return 1;
1960 }
1961
1962 res = save_zero_page(rs, block, offset);
1963 if (res > 0) {
1964 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1965 * page would be stale
1966 */
1967 if (!save_page_use_compression(rs)) {
1968 XBZRLE_cache_lock();
1969 xbzrle_cache_zero_page(rs, block->offset + offset);
1970 XBZRLE_cache_unlock();
1971 }
1972 ram_release_pages(block->idstr, offset, res);
1973 return res;
1974 }
1975
1976 /*
1977 * Do not use multifd for:
1978 * 1. Compression, as the first page in a new block should be posted out
1979 * before sending the compressed page
1980 * 2. Postcopy, as one whole host page should be placed atomically
1981 */
1982 if (!save_page_use_compression(rs) && migrate_use_multifd()
1983 && !migration_in_postcopy()) {
1984 return ram_save_multifd_page(rs, block, offset);
1985 }
1986
1987 return ram_save_page(rs, pss, last_stage);
1988 }
1989
1990 /**
1991 * ram_save_host_page: save a whole host page
1992 *
1993 * Starting at *offset send pages up to the end of the current host
1994 * page. It's valid for the initial offset to point into the middle of
1995 * a host page, in which case the remainder of the host page is sent.
1996 * Only dirty target pages are sent. Note that the host page size may
1997 * be a huge page for this block.
1998 * The saving stops at the boundary of the used_length of the block
1999 * if the RAMBlock isn't a multiple of the host page size.
2000 *
2001 * Returns the number of pages written or negative on error
2002 *
2003 * @rs: current RAM state
2004 * @pss: data about the page we want to send
2006 * @last_stage: if we are at the completion stage
2007 */
2008 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2009 bool last_stage)
2010 {
2011 int tmppages, pages = 0;
2012 size_t pagesize_bits =
2013 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2014 unsigned long hostpage_boundary =
2015 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2016 unsigned long start_page = pss->page;
2017 int res;
2018
2019 if (ramblock_is_ignored(pss->block)) {
2020 error_report("block %s should not be migrated !", pss->block->idstr);
2021 return 0;
2022 }
2023
2024 do {
2025 /* Check if the page is dirty and, if so, send it */
2026 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2027 tmppages = ram_save_target_page(rs, pss, last_stage);
2028 if (tmppages < 0) {
2029 return tmppages;
2030 }
2031
2032 pages += tmppages;
2033 /*
2034 * Allow rate limiting to happen in the middle of huge pages if
2035 * something is sent in the current iteration.
2036 */
2037 if (pagesize_bits > 1 && tmppages > 0) {
2038 migration_rate_limit();
2039 }
2040 }
2041 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2042 } while ((pss->page < hostpage_boundary) &&
2043 offset_in_ramblock(pss->block,
2044 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2045 /* The offset we leave with is the min boundary of host page and block */
2046 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2047
2048 res = ram_save_release_protection(rs, pss, start_page);
2049 return (res < 0 ? res : pages);
2050 }
2051
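/*
 * A minimal worked example of the hostpage_boundary arithmetic used
 * above, assuming a 2MiB hugepage-backed RAMBlock and 4KiB target
 * pages (both values are assumptions for illustration only).
 */
#if 0
static void example_hostpage_boundary(void)
{
    /* 512 target pages per host page */
    size_t pagesize_bits = (2 * 1024 * 1024) / 4096;
    unsigned long page = 1000;     /* current target page index */
    unsigned long boundary = QEMU_ALIGN_UP(page + 1, pagesize_bits);

    /*
     * boundary == 1024: target pages 1000..1023 belong to the same
     * 2MiB host page, so they are all sent before moving on.
     */
    assert(boundary == 1024);
}
#endif
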
2052 /**
2053 * ram_find_and_save_block: finds a dirty page and sends it to f
2054 *
2055 * Called within an RCU critical section.
2056 *
2057 * Returns the number of pages written where zero means no dirty pages,
2058 * or negative on error
2059 *
2060 * @rs: current RAM state
2061 * @last_stage: if we are at the completion stage
2062 *
2063 * On systems where host-page-size > target-page-size it will send all the
2064 * pages in a host page that are dirty.
2065 */
2066
2067 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2068 {
2069 PageSearchStatus pss;
2070 int pages = 0;
2071 bool again, found;
2072
2073 /* No dirty pages when there is zero RAM */
2074 if (!ram_bytes_total()) {
2075 return pages;
2076 }
2077
2078 pss.block = rs->last_seen_block;
2079 pss.page = rs->last_page;
2080 pss.complete_round = false;
2081
2082 if (!pss.block) {
2083 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2084 }
2085
2086 do {
2087 again = true;
2088 found = get_queued_page(rs, &pss);
2089
2090 if (!found) {
2091 /* priority queue empty, so just search for something dirty */
2092 found = find_dirty_block(rs, &pss, &again);
2093 }
2094
2095 if (found) {
2096 pages = ram_save_host_page(rs, &pss, last_stage);
2097 }
2098 } while (!pages && again);
2099
2100 rs->last_seen_block = pss.block;
2101 rs->last_page = pss.page;
2102
2103 return pages;
2104 }
2105
2106 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2107 {
2108 uint64_t pages = size / TARGET_PAGE_SIZE;
2109
2110 if (zero) {
2111 ram_counters.duplicate += pages;
2112 } else {
2113 ram_counters.normal += pages;
2114 ram_counters.transferred += size;
2115 qemu_update_position(f, size);
2116 }
2117 }
2118
2119 static uint64_t ram_bytes_total_common(bool count_ignored)
2120 {
2121 RAMBlock *block;
2122 uint64_t total = 0;
2123
2124 RCU_READ_LOCK_GUARD();
2125
2126 if (count_ignored) {
2127 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2128 total += block->used_length;
2129 }
2130 } else {
2131 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2132 total += block->used_length;
2133 }
2134 }
2135 return total;
2136 }
2137
2138 uint64_t ram_bytes_total(void)
2139 {
2140 return ram_bytes_total_common(false);
2141 }
2142
2143 static void xbzrle_load_setup(void)
2144 {
2145 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 }
2147
2148 static void xbzrle_load_cleanup(void)
2149 {
2150 g_free(XBZRLE.decoded_buf);
2151 XBZRLE.decoded_buf = NULL;
2152 }
2153
2154 static void ram_state_cleanup(RAMState **rsp)
2155 {
2156 if (*rsp) {
2157 migration_page_queue_free(*rsp);
2158 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2159 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2160 g_free(*rsp);
2161 *rsp = NULL;
2162 }
2163 }
2164
2165 static void xbzrle_cleanup(void)
2166 {
2167 XBZRLE_cache_lock();
2168 if (XBZRLE.cache) {
2169 cache_fini(XBZRLE.cache);
2170 g_free(XBZRLE.encoded_buf);
2171 g_free(XBZRLE.current_buf);
2172 g_free(XBZRLE.zero_target_page);
2173 XBZRLE.cache = NULL;
2174 XBZRLE.encoded_buf = NULL;
2175 XBZRLE.current_buf = NULL;
2176 XBZRLE.zero_target_page = NULL;
2177 }
2178 XBZRLE_cache_unlock();
2179 }
2180
2181 static void ram_save_cleanup(void *opaque)
2182 {
2183 RAMState **rsp = opaque;
2184 RAMBlock *block;
2185
2186 /* We don't use dirty log with background snapshots */
2187 if (!migrate_background_snapshot()) {
2188 /* The caller holds the iothread lock or is in a BH, so there is
2189 * no write race against the migration bitmap
2190 */
2191 memory_global_dirty_log_stop();
2192 }
2193
2194 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2195 g_free(block->clear_bmap);
2196 block->clear_bmap = NULL;
2197 g_free(block->bmap);
2198 block->bmap = NULL;
2199 }
2200
2201 xbzrle_cleanup();
2202 compress_threads_save_cleanup();
2203 ram_state_cleanup(rsp);
2204 }
2205
2206 static void ram_state_reset(RAMState *rs)
2207 {
2208 rs->last_seen_block = NULL;
2209 rs->last_sent_block = NULL;
2210 rs->last_page = 0;
2211 rs->last_version = ram_list.version;
2212 rs->xbzrle_enabled = false;
2213 }
2214
2215 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2216
2217 /*
2218 * 'expected' is the value you expect the bitmap mostly to be full
2219 * of; it won't bother printing lines that are all this value.
2220 * If 'todump' is null the migration bitmap is dumped.
2221 */
2222 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2223 unsigned long pages)
2224 {
2225 int64_t cur;
2226 int64_t linelen = 128;
2227 char linebuf[129];
2228
2229 for (cur = 0; cur < pages; cur += linelen) {
2230 int64_t curb;
2231 bool found = false;
2232 /*
2233 * Last line; catch the case where the line length
2234 * is longer than remaining ram
2235 */
2236 if (cur + linelen > pages) {
2237 linelen = pages - cur;
2238 }
2239 for (curb = 0; curb < linelen; curb++) {
2240 bool thisbit = test_bit(cur + curb, todump);
2241 linebuf[curb] = thisbit ? '1' : '.';
2242 found = found || (thisbit != expected);
2243 }
2244 if (found) {
2245 linebuf[curb] = '\0';
2246 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2247 }
2248 }
2249 }
2250
2251 /* **** functions for postcopy ***** */
2252
2253 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2254 {
2255 struct RAMBlock *block;
2256
2257 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2258 unsigned long *bitmap = block->bmap;
2259 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2260 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2261
2262 while (run_start < range) {
2263 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2264 ram_discard_range(block->idstr,
2265 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2266 ((ram_addr_t)(run_end - run_start))
2267 << TARGET_PAGE_BITS);
2268 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2269 }
2270 }
2271 }
2272
2273 /**
2274 * postcopy_send_discard_bm_ram: discard a RAMBlock
2275 *
2276 * Returns zero on success
2277 *
2278 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2279 *
2280 * @ms: current migration state
2281 * @block: RAMBlock to discard
2282 */
2283 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2284 {
2285 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2286 unsigned long current;
2287 unsigned long *bitmap = block->bmap;
2288
2289 for (current = 0; current < end; ) {
2290 unsigned long one = find_next_bit(bitmap, end, current);
2291 unsigned long zero, discard_length;
2292
2293 if (one >= end) {
2294 break;
2295 }
2296
2297 zero = find_next_zero_bit(bitmap, end, one + 1);
2298
2299 if (zero >= end) {
2300 discard_length = end - one;
2301 } else {
2302 discard_length = zero - one;
2303 }
2304 postcopy_discard_send_range(ms, one, discard_length);
2305 current = one + discard_length;
2306 }
2307
2308 return 0;
2309 }
2310
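/*
 * Worked example of the run extraction above (sketch only): with a
 * 16-page bitmap whose bits 2..6 are set (0x7c), the loop finds a
 * single run and sends one discard of (start = 2, length = 5).
 */
#if 0
static void example_extract_one_run(void)
{
    unsigned long bitmap[1] = { 0x7cUL };  /* target pages 2..6 dirty */
    unsigned long end = 16;

    unsigned long one = find_next_bit(bitmap, end, 0);              /* 2 */
    unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);  /* 7 */
    unsigned long discard_length = zero - one;                      /* 5 */

    assert(one == 2 && discard_length == 5);
}
#endif
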
2311 /**
2312 * postcopy_each_ram_send_discard: discard all RAMBlocks
2313 *
2314 * Returns 0 for success or negative for error
2315 *
2316 * Utility for the outgoing postcopy code.
2317 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2318 * passing it bitmap indexes and name.
2319 * (qemu_ram_foreach_block ends up passing unscaled lengths
2320 * which would mean postcopy code would have to deal with target page)
2321 *
2322 * @ms: current migration state
2323 */
2324 static int postcopy_each_ram_send_discard(MigrationState *ms)
2325 {
2326 struct RAMBlock *block;
2327 int ret;
2328
2329 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2330 postcopy_discard_send_init(ms, block->idstr);
2331
2332 /*
2333 * Postcopy sends chunks of bitmap over the wire, but it
2334 * just needs indexes at this point, which avoids it having
2335 * target page specific code.
2336 */
2337 ret = postcopy_send_discard_bm_ram(ms, block);
2338 postcopy_discard_send_finish(ms);
2339 if (ret) {
2340 return ret;
2341 }
2342 }
2343
2344 return 0;
2345 }
2346
2347 /**
2348 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2349 *
2350 * Helper for postcopy_chunk_hostpages; it is called once per RAMBlock
2351 * to canonicalize that block's dirty bitmap.
2352 *
2353 * Postcopy requires that all target pages in a host page are dirty or
2354 * clean, not a mix. This function canonicalizes the bitmap so that
2355 * partially dirty host pages become fully dirty.
2356 *
2357 * @ms: current migration state
2358 * @block: block that contains the page we want to canonicalize
2359 */
2360 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2361 {
2362 RAMState *rs = ram_state;
2363 unsigned long *bitmap = block->bmap;
2364 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2365 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2366 unsigned long run_start;
2367
2368 if (block->page_size == TARGET_PAGE_SIZE) {
2369 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2370 return;
2371 }
2372
2373 /* Find a dirty page */
2374 run_start = find_next_bit(bitmap, pages, 0);
2375
2376 while (run_start < pages) {
2377
2378 /*
2379 * If the start of this run of pages is in the middle of a host
2380 * page, then we need to fixup this host page.
2381 */
2382 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2383 /* Find the end of this run */
2384 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2385 /*
2386 * If the end isn't at the start of a host page, then the
2387 * run doesn't finish at the end of a host page
2388 * and we need to discard.
2389 */
2390 }
2391
2392 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2393 unsigned long page;
2394 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2395 host_ratio);
2396 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2397
2398 /* Clean up the bitmap */
2399 for (page = fixup_start_addr;
2400 page < fixup_start_addr + host_ratio; page++) {
2401 /*
2402 * Remark them as dirty, updating the count for any pages
2403 * that weren't previously dirty.
2404 */
2405 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2406 }
2407 }
2408
2409 /* Find the next dirty page for the next iteration */
2410 run_start = find_next_bit(bitmap, pages, run_start);
2411 }
2412 }
2413
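/*
 * Worked example of the fixup above (sketch only), assuming
 * host_ratio == 4, e.g. 16KiB host pages with 4KiB target pages:
 * a dirty run starting at target page 6 causes the whole host page
 * (target pages 4..7) to be re-marked dirty.
 */
#if 0
static void example_canonicalize_one_run(void)
{
    unsigned int host_ratio = 4;
    unsigned long run_start = 6;   /* starts in the middle of a host page */

    unsigned long fixup_start = QEMU_ALIGN_DOWN(run_start, host_ratio); /* 4 */
    unsigned long next_start = QEMU_ALIGN_UP(run_start, host_ratio);    /* 8 */

    /*
     * Target pages fixup_start .. fixup_start + host_ratio - 1 (4..7)
     * each get test_and_set_bit(), so no host page is left half dirty.
     */
    assert(fixup_start == 4 && next_start == 8);
}
#endif
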
2414 /**
2415 * postcopy_chunk_hostpages: discard any partially sent host page
2416 *
2417 * Utility for the outgoing postcopy code.
2418 *
2419 * Discard any partially sent host-page size chunks, mark any partially
2420 * dirty host-page size chunks as all dirty. In this case the host-page
2421 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2422 *
2423 * Returns zero on success
2424 *
2425 * @ms: current migration state
2426 * @block: block we want to work with
2427 */
2428 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2429 {
2430 postcopy_discard_send_init(ms, block->idstr);
2431
2432 /*
2433 * Ensure that all partially dirty host pages are made fully dirty.
2434 */
2435 postcopy_chunk_hostpages_pass(ms, block);
2436
2437 postcopy_discard_send_finish(ms);
2438 return 0;
2439 }
2440
2441 /**
2442 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2443 *
2444 * Returns zero on success
2445 *
2446 * Transmit the set of pages to be discarded after precopy to the target;
2447 * these are pages that:
2448 * a) have been previously transmitted but are now dirty again
2449 * b) have never been transmitted; this ensures that any pages on the
2450 * destination that have been mapped by background tasks get
2451 * discarded (transparent huge pages are the specific concern)
2452 * Hopefully this is pretty sparse.
2453 *
2454 * @ms: current migration state
2455 */
2456 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2457 {
2458 RAMState *rs = ram_state;
2459 RAMBlock *block;
2460 int ret;
2461
2462 RCU_READ_LOCK_GUARD();
2463
2464 /* This should be our last sync, the src is now paused */
2465 migration_bitmap_sync(rs);
2466
2467 /* Easiest way to make sure we don't resume in the middle of a host-page */
2468 rs->last_seen_block = NULL;
2469 rs->last_sent_block = NULL;
2470 rs->last_page = 0;
2471
2472 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2473 /* Deal with TPS != HPS and huge pages */
2474 ret = postcopy_chunk_hostpages(ms, block);
2475 if (ret) {
2476 return ret;
2477 }
2478
2479 #ifdef DEBUG_POSTCOPY
2480 ram_debug_dump_bitmap(block->bmap, true,
2481 block->used_length >> TARGET_PAGE_BITS);
2482 #endif
2483 }
2484 trace_ram_postcopy_send_discard_bitmap();
2485
2486 return postcopy_each_ram_send_discard(ms);
2487 }
2488
2489 /**
2490 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2491 *
2492 * Returns zero on success
2493 *
2494 * @rbname: name of the RAMBlock of the request. NULL means the
2495 * same as the last one.
2496 * @start: starting offset (in bytes) within the RAMBlock
2497 * @length: length (in bytes) to discard
2498 */
2499 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2500 {
2501 trace_ram_discard_range(rbname, start, length);
2502
2503 RCU_READ_LOCK_GUARD();
2504 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2505
2506 if (!rb) {
2507 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2508 return -1;
2509 }
2510
2511 /*
2512 * On source VM, we don't need to update the received bitmap since
2513 * we don't even have one.
2514 */
2515 if (rb->receivedmap) {
2516 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2517 length >> qemu_target_page_bits());
2518 }
2519
2520 return ram_block_discard_range(rb, start, length);
2521 }
2522
2523 /*
2524 * For every allocation below, try not to crash the VM if the
2525 * allocation fails.
2526 */
2527 static int xbzrle_init(void)
2528 {
2529 Error *local_err = NULL;
2530
2531 if (!migrate_use_xbzrle()) {
2532 return 0;
2533 }
2534
2535 XBZRLE_cache_lock();
2536
2537 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2538 if (!XBZRLE.zero_target_page) {
2539 error_report("%s: Error allocating zero page", __func__);
2540 goto err_out;
2541 }
2542
2543 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2544 TARGET_PAGE_SIZE, &local_err);
2545 if (!XBZRLE.cache) {
2546 error_report_err(local_err);
2547 goto free_zero_page;
2548 }
2549
2550 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2551 if (!XBZRLE.encoded_buf) {
2552 error_report("%s: Error allocating encoded_buf", __func__);
2553 goto free_cache;
2554 }
2555
2556 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2557 if (!XBZRLE.current_buf) {
2558 error_report("%s: Error allocating current_buf", __func__);
2559 goto free_encoded_buf;
2560 }
2561
2562 /* We are all good */
2563 XBZRLE_cache_unlock();
2564 return 0;
2565
2566 free_encoded_buf:
2567 g_free(XBZRLE.encoded_buf);
2568 XBZRLE.encoded_buf = NULL;
2569 free_cache:
2570 cache_fini(XBZRLE.cache);
2571 XBZRLE.cache = NULL;
2572 free_zero_page:
2573 g_free(XBZRLE.zero_target_page);
2574 XBZRLE.zero_target_page = NULL;
2575 err_out:
2576 XBZRLE_cache_unlock();
2577 return -ENOMEM;
2578 }
2579
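/*
 * Sketch of the fallible-allocation idiom used in xbzrle_init() above:
 * g_try_malloc0() returns NULL on failure, so migration setup can fail
 * gracefully instead of aborting the whole VM as plain g_malloc() would.
 * The helper name is hypothetical.
 */
#if 0
static uint8_t *example_try_alloc_page_buffer(void)
{
    uint8_t *buf = g_try_malloc0(TARGET_PAGE_SIZE);

    if (!buf) {
        error_report("%s: out of memory", __func__);
        return NULL;   /* the caller turns this into a migration error */
    }
    return buf;
}
#endif
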
2580 static int ram_state_init(RAMState **rsp)
2581 {
2582 *rsp = g_try_new0(RAMState, 1);
2583
2584 if (!*rsp) {
2585 error_report("%s: Init ramstate fail", __func__);
2586 return -1;
2587 }
2588
2589 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2590 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2591 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2592
2593 /*
2594 * Count the total number of pages used by RAM blocks, not including any
2595 * gaps due to alignment or unplugs.
2596 * This must match the initial values of the dirty bitmap.
2597 */
2598 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2599 ram_state_reset(*rsp);
2600
2601 return 0;
2602 }
2603
2604 static void ram_list_init_bitmaps(void)
2605 {
2606 MigrationState *ms = migrate_get_current();
2607 RAMBlock *block;
2608 unsigned long pages;
2609 uint8_t shift;
2610
2611 /* Skip setting bitmap if there is no RAM */
2612 if (ram_bytes_total()) {
2613 shift = ms->clear_bitmap_shift;
2614 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2615 error_report("clear_bitmap_shift (%u) too big, using "
2616 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2617 shift = CLEAR_BITMAP_SHIFT_MAX;
2618 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2619 error_report("clear_bitmap_shift (%u) too small, using "
2620 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2621 shift = CLEAR_BITMAP_SHIFT_MIN;
2622 }
2623
2624 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2625 pages = block->max_length >> TARGET_PAGE_BITS;
2626 /*
2627 * The initial dirty bitmap for migration must be set with all
2628 * ones to make sure we'll migrate every guest RAM page to the
2629 * destination.
2630 * Here we set RAMBlock.bmap all to 1 because when restarting a
2631 * new migration after a failed one, ram_list.
2632 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2633 * guest memory.
2634 */
2635 block->bmap = bitmap_new(pages);
2636 bitmap_set(block->bmap, 0, pages);
2637 block->clear_bmap_shift = shift;
2638 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2639 }
2640 }
2641 }
2642
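/*
 * Sketch of the clear-bitmap sizing above, assuming each clear_bmap bit
 * covers 2^shift target pages: for a 4GiB block with 4KiB target pages
 * (1048576 pages) and shift == 18, only 4 bits are needed.
 */
#if 0
static void example_clear_bmap_bits(void)
{
    unsigned long pages = (4ULL * 1024 * 1024 * 1024) / 4096;  /* 1048576 */
    uint8_t shift = 18;                 /* one bit per 262144 target pages */
    unsigned long bits = DIV_ROUND_UP(pages, 1UL << shift);

    assert(bits == 4);
}
#endif
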
2643 static void ram_init_bitmaps(RAMState *rs)
2644 {
2645 /* For memory_global_dirty_log_start below. */
2646 qemu_mutex_lock_iothread();
2647 qemu_mutex_lock_ramlist();
2648
2649 WITH_RCU_READ_LOCK_GUARD() {
2650 ram_list_init_bitmaps();
2651 /* We don't use dirty log with background snapshots */
2652 if (!migrate_background_snapshot()) {
2653 memory_global_dirty_log_start();
2654 migration_bitmap_sync_precopy(rs);
2655 }
2656 }
2657 qemu_mutex_unlock_ramlist();
2658 qemu_mutex_unlock_iothread();
2659 }
2660
2661 static int ram_init_all(RAMState **rsp)
2662 {
2663 if (ram_state_init(rsp)) {
2664 return -1;
2665 }
2666
2667 if (xbzrle_init()) {
2668 ram_state_cleanup(rsp);
2669 return -1;
2670 }
2671
2672 ram_init_bitmaps(*rsp);
2673
2674 return 0;
2675 }
2676
2677 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2678 {
2679 RAMBlock *block;
2680 uint64_t pages = 0;
2681
2682 /*
2683 * Postcopy is not using xbzrle/compression, so no need for that.
2684 * Also, since the source is already halted, we don't need to care
2685 * about dirty page logging either.
2686 */
2687
2688 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2689 pages += bitmap_count_one(block->bmap,
2690 block->used_length >> TARGET_PAGE_BITS);
2691 }
2692
2693 /* This may not be aligned with current bitmaps. Recalculate. */
2694 rs->migration_dirty_pages = pages;
2695
2696 ram_state_reset(rs);
2697
2698 /* Update RAMState cache of output QEMUFile */
2699 rs->f = out;
2700
2701 trace_ram_state_resume_prepare(pages);
2702 }
2703
2704 /*
2705 * This function clears bits of the free pages reported by the caller from the
2706 * migration dirty bitmap. @addr is the host address corresponding to the
2707 * start of the contiguous guest free pages, and @len is the total size in
2708 * bytes of those pages.
2709 */
2710 void qemu_guest_free_page_hint(void *addr, size_t len)
2711 {
2712 RAMBlock *block;
2713 ram_addr_t offset;
2714 size_t used_len, start, npages;
2715 MigrationState *s = migrate_get_current();
2716
2717 /* This function is currently expected to be used during live migration */
2718 if (!migration_is_setup_or_active(s->state)) {
2719 return;
2720 }
2721
2722 for (; len > 0; len -= used_len, addr += used_len) {
2723 block = qemu_ram_block_from_host(addr, false, &offset);
2724 if (unlikely(!block || offset >= block->used_length)) {
2725 /*
2726 * The implementation might not support RAMBlock resize during
2727 * live migration, but it could happen in theory with future
2728 * updates. So we add a check here to capture that case.
2729 */
2730 error_report_once("%s unexpected error", __func__);
2731 return;
2732 }
2733
2734 if (len <= block->used_length - offset) {
2735 used_len = len;
2736 } else {
2737 used_len = block->used_length - offset;
2738 }
2739
2740 start = offset >> TARGET_PAGE_BITS;
2741 npages = used_len >> TARGET_PAGE_BITS;
2742
2743 qemu_mutex_lock(&ram_state->bitmap_mutex);
2744 ram_state->migration_dirty_pages -=
2745 bitmap_count_one_with_offset(block->bmap, start, npages);
2746 bitmap_clear(block->bmap, start, npages);
2747 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2748 }
2749 }
2750
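/*
 * Worked example of the range conversion above (sketch only), assuming
 * 4KiB target pages (TARGET_PAGE_BITS == 12): a 64KiB free-page hint
 * starting 2MiB into a block clears 16 dirty bits starting at bit 512.
 */
#if 0
static void example_free_page_hint_range(void)
{
    ram_addr_t offset = 2 * 1024 * 1024;   /* hint offset within the block */
    size_t used_len = 64 * 1024;           /* part of the hint in this block */

    size_t start = offset >> 12;           /* first page index to clear */
    size_t npages = used_len >> 12;        /* number of pages to clear */

    assert(start == 512 && npages == 16);
}
#endif
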
2751 /*
2752 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2753 * long-running RCU critical section. When RCU reclaims in the code
2754 * start to become numerous it will be necessary to reduce the
2755 * granularity of these critical sections.
2756 */
2757
2758 /**
2759 * ram_save_setup: Setup RAM for migration
2760 *
2761 * Returns zero to indicate success and negative for error
2762 *
2763 * @f: QEMUFile where to send the data
2764 * @opaque: RAMState pointer
2765 */
2766 static int ram_save_setup(QEMUFile *f, void *opaque)
2767 {
2768 RAMState **rsp = opaque;
2769 RAMBlock *block;
2770
2771 if (compress_threads_save_setup()) {
2772 return -1;
2773 }
2774
2775 /* migration has already set up the bitmap, reuse it. */
2776 if (!migration_in_colo_state()) {
2777 if (ram_init_all(rsp) != 0) {
2778 compress_threads_save_cleanup();
2779 return -1;
2780 }
2781 }
2782 (*rsp)->f = f;
2783
2784 WITH_RCU_READ_LOCK_GUARD() {
2785 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2786
2787 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2788 qemu_put_byte(f, strlen(block->idstr));
2789 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2790 qemu_put_be64(f, block->used_length);
2791 if (migrate_postcopy_ram() && block->page_size !=
2792 qemu_host_page_size) {
2793 qemu_put_be64(f, block->page_size);
2794 }
2795 if (migrate_ignore_shared()) {
2796 qemu_put_be64(f, block->mr->addr);
2797 }
2798 }
2799 }
2800
2801 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2802 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2803
2804 multifd_send_sync_main(f);
2805 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2806 qemu_fflush(f);
2807
2808 return 0;
2809 }
2810
2811 /**
2812 * ram_save_iterate: iterative stage for migration
2813 *
2814 * Returns zero to indicate success and negative for error
2815 *
2816 * @f: QEMUFile where to send the data
2817 * @opaque: RAMState pointer
2818 */
2819 static int ram_save_iterate(QEMUFile *f, void *opaque)
2820 {
2821 RAMState **temp = opaque;
2822 RAMState *rs = *temp;
2823 int ret = 0;
2824 int i;
2825 int64_t t0;
2826 int done = 0;
2827
2828 if (blk_mig_bulk_active()) {
2829 /* Avoid transferring ram during bulk phase of block migration as
2830 * the bulk phase will usually take a long time and transferring
2831 * ram updates during that time is pointless. */
2832 goto out;
2833 }
2834
2835 /*
2836 * We'll hold this lock for a while, but that's okay for two reasons.
2837 * Firstly, the only other thread that could take it is the one calling
2838 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2839 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2840 * guarantees that we'll release it on a regular basis.
2841 */
2842 qemu_mutex_lock(&rs->bitmap_mutex);
2843 WITH_RCU_READ_LOCK_GUARD() {
2844 if (ram_list.version != rs->last_version) {
2845 ram_state_reset(rs);
2846 }
2847
2848 /* Read version before ram_list.blocks */
2849 smp_rmb();
2850
2851 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2852
2853 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2854 i = 0;
2855 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2856 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2857 int pages;
2858
2859 if (qemu_file_get_error(f)) {
2860 break;
2861 }
2862
2863 pages = ram_find_and_save_block(rs, false);
2864 /* no more pages to send */
2865 if (pages == 0) {
2866 done = 1;
2867 break;
2868 }
2869
2870 if (pages < 0) {
2871 qemu_file_set_error(f, pages);
2872 break;
2873 }
2874
2875 rs->target_page_count += pages;
2876
2877 /*
2878 * During postcopy, it is necessary to make sure one whole host
2879 * page is sent in one chunk.
2880 */
2881 if (migrate_postcopy_ram()) {
2882 flush_compressed_data(rs);
2883 }
2884
2885 /*
2886 * We want to check in the 1st loop, just in case it was the 1st
2887 * time and we had to sync the dirty bitmap.
2888 * qemu_clock_get_ns() is a bit expensive, so we only check it every
2889 * few iterations
2890 */
2891 if ((i & 63) == 0) {
2892 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2893 1000000;
2894 if (t1 > MAX_WAIT) {
2895 trace_ram_save_iterate_big_wait(t1, i);
2896 break;
2897 }
2898 }
2899 i++;
2900 }
2901 }
2902 qemu_mutex_unlock(&rs->bitmap_mutex);
2903
2904 /*
2905 * Must occur before EOS (or any QEMUFile operation)
2906 * because of RDMA protocol.
2907 */
2908 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2909
2910 out:
2911 if (ret >= 0
2912 && migration_is_setup_or_active(migrate_get_current()->state)) {
2913 multifd_send_sync_main(rs->f);
2914 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2915 qemu_fflush(f);
2916 ram_counters.transferred += 8;
2917
2918 ret = qemu_file_get_error(f);
2919 }
2920 if (ret < 0) {
2921 return ret;
2922 }
2923
2924 return done;
2925 }
2926
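/*
 * Minimal sketch of the "check the clock only every 64 iterations"
 * pattern used in ram_save_iterate() above; the mask and MAX_WAIT
 * mirror the loop, the rest is illustrative scaffolding.
 */
#if 0
static void example_bounded_send_loop(void)
{
    int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    int i = 0;

    for (;;) {
        /* ... send one batch of pages here ... */

        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                          1000000;
            if (t1 > MAX_WAIT) {
                break;   /* yield back to the migration thread's main loop */
            }
        }
        i++;
    }
}
#endif
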
2927 /**
2928 * ram_save_complete: function called to send the remaining amount of ram
2929 *
2930 * Returns zero to indicate success or negative on error
2931 *
2932 * Called with iothread lock
2933 *
2934 * @f: QEMUFile where to send the data
2935 * @opaque: RAMState pointer
2936 */
2937 static int ram_save_complete(QEMUFile *f, void *opaque)
2938 {
2939 RAMState **temp = opaque;
2940 RAMState *rs = *temp;
2941 int ret = 0;
2942
2943 WITH_RCU_READ_LOCK_GUARD() {
2944 if (!migration_in_postcopy()) {
2945 migration_bitmap_sync_precopy(rs);
2946 }
2947
2948 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2949
2950 /* try transferring iterative blocks of memory */
2951
2952 /* flush all remaining blocks regardless of rate limiting */
2953 while (true) {
2954 int pages;
2955
2956 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2957 /* no more blocks to send */
2958 if (pages == 0) {
2959 break;
2960 }
2961 if (pages < 0) {
2962 ret = pages;
2963 break;
2964 }
2965 }
2966
2967 flush_compressed_data(rs);
2968 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2969 }
2970
2971 if (ret >= 0) {
2972 multifd_send_sync_main(rs->f);
2973 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2974 qemu_fflush(f);
2975 }
2976
2977 return ret;
2978 }
2979
2980 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2981 uint64_t *res_precopy_only,
2982 uint64_t *res_compatible,
2983 uint64_t *res_postcopy_only)
2984 {
2985 RAMState **temp = opaque;
2986 RAMState *rs = *temp;
2987 uint64_t remaining_size;
2988
2989 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2990
2991 if (!migration_in_postcopy() &&
2992 remaining_size < max_size) {
2993 qemu_mutex_lock_iothread();
2994 WITH_RCU_READ_LOCK_GUARD() {
2995 migration_bitmap_sync_precopy(rs);
2996 }
2997 qemu_mutex_unlock_iothread();
2998 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2999 }
3000
3001 if (migrate_postcopy_ram()) {
3002 /* We can do postcopy, and all the data is postcopiable */
3003 *res_compatible += remaining_size;
3004 } else {
3005 *res_precopy_only += remaining_size;
3006 }
3007 }
3008
3009 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3010 {
3011 unsigned int xh_len;
3012 int xh_flags;
3013 uint8_t *loaded_data;
3014
3015 /* extract RLE header */
3016 xh_flags = qemu_get_byte(f);
3017 xh_len = qemu_get_be16(f);
3018
3019 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3020 error_report("Failed to load XBZRLE page - wrong compression!");
3021 return -1;
3022 }
3023
3024 if (xh_len > TARGET_PAGE_SIZE) {
3025 error_report("Failed to load XBZRLE page - len overflow!");
3026 return -1;
3027 }
3028 loaded_data = XBZRLE.decoded_buf;
3029 /* load data and decode */
3030 /* it can change loaded_data to point to an internal buffer */
3031 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3032
3033 /* decode RLE */
3034 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3035 TARGET_PAGE_SIZE) == -1) {
3036 error_report("Failed to load XBZRLE page - decode error!");
3037 return -1;
3038 }
3039
3040 return 0;
3041 }
3042
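/*
 * Sketch of the on-wire record that load_xbzrle() above expects after
 * the usual addr/flags word: one header byte, a big-endian 16-bit
 * length, then the encoded delta. 'encoded_buf' and 'encoded_len'
 * stand in for the output of the sender's XBZRLE encoder and are
 * assumptions for illustration.
 */
#if 0
static void example_put_xbzrle_record(QEMUFile *f, const uint8_t *encoded_buf,
                                      uint16_t encoded_len)
{
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);   /* read back as xh_flags */
    qemu_put_be16(f, encoded_len);            /* xh_len, <= TARGET_PAGE_SIZE */
    qemu_put_buffer(f, encoded_buf, encoded_len);
}
#endif
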
3043 /**
3044 * ram_block_from_stream: read a RAMBlock id from the migration stream
3045 *
3046 * Must be called from within a rcu critical section.
3047 *
3048 * Returns a pointer from within the RCU-protected ram_list.
3049 *
3050 * @f: QEMUFile where to read the data from
3051 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3052 */
3053 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3054 {
3055 static RAMBlock *block;
3056 char id[256];
3057 uint8_t len;
3058
3059 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3060 if (!block) {
3061 error_report("Ack, bad migration stream!");
3062 return NULL;
3063 }
3064 return block;
3065 }
3066
3067 len = qemu_get_byte(f);
3068 qemu_get_buffer(f, (uint8_t *)id, len);
3069 id[len] = 0;
3070
3071 block = qemu_ram_block_by_name(id);
3072 if (!block) {
3073 error_report("Can't find block %s", id);
3074 return NULL;
3075 }
3076
3077 if (ramblock_is_ignored(block)) {
3078 error_report("block %s should not be migrated !", id);
3079 return NULL;
3080 }
3081
3082 return block;
3083 }
3084
3085 static inline void *host_from_ram_block_offset(RAMBlock *block,
3086 ram_addr_t offset)
3087 {
3088 if (!offset_in_ramblock(block, offset)) {
3089 return NULL;
3090 }
3091
3092 return block->host + offset;
3093 }
3094
3095 static void *host_page_from_ram_block_offset(RAMBlock *block,
3096 ram_addr_t offset)
3097 {
3098 /* Note: Explicitly no check against offset_in_ramblock(). */
3099 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3100 block->page_size);
3101 }
3102
3103 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3104 ram_addr_t offset)
3105 {
3106 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3107 }
3108
3109 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3110 ram_addr_t offset, bool record_bitmap)
3111 {
3112 if (!offset_in_ramblock(block, offset)) {
3113 return NULL;
3114 }
3115 if (!block->colo_cache) {
3116 error_report("%s: colo_cache is NULL in block :%s",
3117 __func__, block->idstr);
3118 return NULL;
3119 }
3120
3121 /*
3122 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3123 * It helps us decide which pages in the RAM cache should be flushed
3124 * into the VM's RAM later.
3125 */
3126 if (record_bitmap &&
3127 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3128 ram_state->migration_dirty_pages++;
3129 }
3130 return block->colo_cache + offset;
3131 }
3132
3133 /**
3134 * ram_handle_compressed: handle the zero page case
3135 *
3136 * If a page (or a whole RDMA chunk) has been
3137 * determined to be zero, then zap it.
3138 *
3139 * @host: host address for the zero page
3140 * @ch: what the page is filled from. We only support zero
3141 * @size: size of the zero page
3142 */
3143 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3144 {
3145 if (ch != 0 || !is_zero_range(host, size)) {
3146 memset(host, ch, size);
3147 }
3148 }
3149
3150 /* return the size after decompression, or negative value on error */
3151 static int
3152 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3153 const uint8_t *source, size_t source_len)
3154 {
3155 int err;
3156
3157 err = inflateReset(stream);
3158 if (err != Z_OK) {
3159 return -1;
3160 }
3161
3162 stream->avail_in = source_len;
3163 stream->next_in = (uint8_t *)source;
3164 stream->avail_out = dest_len;
3165 stream->next_out = dest;
3166
3167 err = inflate(stream, Z_NO_FLUSH);
3168 if (err != Z_STREAM_END) {
3169 return -1;
3170 }
3171
3172 return stream->total_out;
3173 }
3174
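/*
 * Sketch of a matching compression step using plain zlib, only to show
 * what qemu_uncompress_data() above undoes; the real sender uses
 * per-thread deflate streams rather than compress2(). The helper name
 * is hypothetical.
 */
#if 0
static ssize_t example_compress_page(uint8_t *dst, size_t dst_size,
                                     const uint8_t *page, size_t page_size)
{
    uLongf out_len = dst_size;   /* should be >= compressBound(page_size) */

    if (compress2(dst, &out_len, page, page_size, Z_BEST_SPEED) != Z_OK) {
        return -1;
    }
    return out_len;              /* becomes 'source_len' on the load side */
}
#endif
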
3175 static void *do_data_decompress(void *opaque)
3176 {
3177 DecompressParam *param = opaque;
3178 unsigned long pagesize;
3179 uint8_t *des;
3180 int len, ret;
3181
3182 qemu_mutex_lock(&param->mutex);
3183 while (!param->quit) {
3184 if (param->des) {
3185 des = param->des;
3186 len = param->len;
3187 param->des = 0;
3188 qemu_mutex_unlock(&param->mutex);
3189
3190 pagesize = TARGET_PAGE_SIZE;
3191
3192 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3193 param->compbuf, len);
3194 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3195 error_report("decompress data failed");
3196 qemu_file_set_error(decomp_file, ret);
3197 }
3198
3199 qemu_mutex_lock(&decomp_done_lock);
3200 param->done = true;
3201 qemu_cond_signal(&decomp_done_cond);
3202 qemu_mutex_unlock(&decomp_done_lock);
3203
3204 qemu_mutex_lock(&param->mutex);
3205 } else {
3206 qemu_cond_wait(&param->cond, &param->mutex);
3207 }
3208 }
3209 qemu_mutex_unlock(&param->mutex);
3210
3211 return NULL;
3212 }
3213
3214 static int wait_for_decompress_done(void)
3215 {
3216 int idx, thread_count;
3217
3218 if (!migrate_use_compression()) {
3219 return 0;
3220 }
3221
3222 thread_count = migrate_decompress_threads();
3223 qemu_mutex_lock(&decomp_done_lock);
3224 for (idx = 0; idx < thread_count; idx++) {
3225 while (!decomp_param[idx].done) {
3226 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3227 }
3228 }
3229 qemu_mutex_unlock(&decomp_done_lock);
3230 return qemu_file_get_error(decomp_file);
3231 }
3232
3233 static void compress_threads_load_cleanup(void)
3234 {
3235 int i, thread_count;
3236
3237 if (!migrate_use_compression()) {
3238 return;
3239 }
3240 thread_count = migrate_decompress_threads();
3241 for (i = 0; i < thread_count; i++) {
3242 /*
3243 * we use it as an indicator of whether the thread is
3244 * properly initialized or not
3245 */
3246 if (!decomp_param[i].compbuf) {
3247 break;
3248 }
3249
3250 qemu_mutex_lock(&decomp_param[i].mutex);
3251 decomp_param[i].quit = true;
3252 qemu_cond_signal(&decomp_param[i].cond);
3253 qemu_mutex_unlock(&decomp_param[i].mutex);
3254 }
3255 for (i = 0; i < thread_count; i++) {
3256 if (!decomp_param[i].compbuf) {
3257 break;
3258 }
3259
3260 qemu_thread_join(decompress_threads + i);
3261 qemu_mutex_destroy(&decomp_param[i].mutex);
3262 qemu_cond_destroy(&decomp_param[i].cond);
3263 inflateEnd(&decomp_param[i].stream);
3264 g_free(decomp_param[i].compbuf);
3265 decomp_param[i].compbuf = NULL;
3266 }
3267 g_free(decompress_threads);
3268 g_free(decomp_param);
3269 decompress_threads = NULL;
3270 decomp_param = NULL;
3271 decomp_file = NULL;
3272 }
3273
3274 static int compress_threads_load_setup(QEMUFile *f)
3275 {
3276 int i, thread_count;
3277
3278 if (!migrate_use_compression()) {
3279 return 0;
3280 }
3281
3282 thread_count = migrate_decompress_threads();
3283 decompress_threads = g_new0(QemuThread, thread_count);
3284 decomp_param = g_new0(DecompressParam, thread_count);
3285 qemu_mutex_init(&decomp_done_lock);
3286 qemu_cond_init(&decomp_done_cond);
3287 decomp_file = f;
3288 for (i = 0; i < thread_count; i++) {
3289 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3290 goto exit;
3291 }
3292
3293 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3294 qemu_mutex_init(&decomp_param[i].mutex);
3295 qemu_cond_init(&decomp_param[i].cond);
3296 decomp_param[i].done = true;
3297 decomp_param[i].quit = false;
3298 qemu_thread_create(decompress_threads + i, "decompress",
3299 do_data_decompress, decomp_param + i,
3300 QEMU_THREAD_JOINABLE);
3301 }
3302 return 0;
3303 exit:
3304 compress_threads_load_cleanup();
3305 return -1;
3306 }
3307
3308 static void decompress_data_with_multi_threads(QEMUFile *f,
3309 void *host, int len)
3310 {
3311 int idx, thread_count;
3312
3313 thread_count = migrate_decompress_threads();
3314 QEMU_LOCK_GUARD(&decomp_done_lock);
3315 while (true) {
3316 for (idx = 0; idx < thread_count; idx++) {
3317 if (decomp_param[idx].done) {
3318 decomp_param[idx].done = false;
3319 qemu_mutex_lock(&decomp_param[idx].mutex);
3320 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3321 decomp_param[idx].des = host;
3322 decomp_param[idx].len = len;
3323 qemu_cond_signal(&decomp_param[idx].cond);
3324 qemu_mutex_unlock(&decomp_param[idx].mutex);
3325 break;
3326 }
3327 }
3328 if (idx < thread_count) {
3329 break;
3330 } else {
3331 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3332 }
3333 }
3334 }
3335
3336 static void colo_init_ram_state(void)
3337 {
3338 ram_state_init(&ram_state);
3339 }
3340
3341 /*
3342 * colo cache: this is for the secondary VM, we cache the whole
3343 * memory of the secondary VM; the global lock needs to be held
3344 * to call this helper.
3345 */
3346 int colo_init_ram_cache(void)
3347 {
3348 RAMBlock *block;
3349
3350 WITH_RCU_READ_LOCK_GUARD() {
3351 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3352 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3353 NULL, false, false);
3354 if (!block->colo_cache) {
3355 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3356 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3357 block->used_length);
3358 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3359 if (block->colo_cache) {
3360 qemu_anon_ram_free(block->colo_cache, block->used_length);
3361 block->colo_cache = NULL;
3362 }
3363 }
3364 return -errno;
3365 }
3366 }
3367 }
3368
3369 /*
3370 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3371 * to decide which pages in the cache should be flushed into the SVM's RAM.
3372 * Here we use the same name 'ram_bitmap' as for migration.
3373 */
3374 if (ram_bytes_total()) {
3375 RAMBlock *block;
3376
3377 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3378 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3379 block->bmap = bitmap_new(pages);
3380 }
3381 }
3382
3383 colo_init_ram_state();
3384 return 0;
3385 }
3386
3387 /* TODO: duplicated with ram_init_bitmaps */
3388 void colo_incoming_start_dirty_log(void)
3389 {
3390 RAMBlock *block = NULL;
3391 /* For memory_global_dirty_log_start below. */
3392 qemu_mutex_lock_iothread();
3393 qemu_mutex_lock_ramlist();
3394
3395 memory_global_dirty_log_sync();
3396 WITH_RCU_READ_LOCK_GUARD() {
3397 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398 ramblock_sync_dirty_bitmap(ram_state, block);
3399 /* Discard this dirty bitmap record */
3400 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3401 }
3402 memory_global_dirty_log_start();
3403 }
3404 ram_state->migration_dirty_pages = 0;
3405 qemu_mutex_unlock_ramlist();
3406 qemu_mutex_unlock_iothread();
3407 }
3408
3409 /* The global lock needs to be held to call this helper */
3410 void colo_release_ram_cache(void)
3411 {
3412 RAMBlock *block;
3413
3414 memory_global_dirty_log_stop();
3415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416 g_free(block->bmap);
3417 block->bmap = NULL;
3418 }
3419
3420 WITH_RCU_READ_LOCK_GUARD() {
3421 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3422 if (block->colo_cache) {
3423 qemu_anon_ram_free(block->colo_cache, block->used_length);
3424 block->colo_cache = NULL;
3425 }
3426 }
3427 }
3428 ram_state_cleanup(&ram_state);
3429 }
3430
3431 /**
3432 * ram_load_setup: Setup RAM for migration incoming side
3433 *
3434 * Returns zero to indicate success and negative for error
3435 *
3436 * @f: QEMUFile where to receive the data
3437 * @opaque: RAMState pointer
3438 */
3439 static int ram_load_setup(QEMUFile *f, void *opaque)
3440 {
3441 if (compress_threads_load_setup(f)) {
3442 return -1;
3443 }
3444
3445 xbzrle_load_setup();
3446 ramblock_recv_map_init();
3447
3448 return 0;
3449 }
3450
3451 static int ram_load_cleanup(void *opaque)
3452 {
3453 RAMBlock *rb;
3454
3455 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3456 qemu_ram_block_writeback(rb);
3457 }
3458
3459 xbzrle_load_cleanup();
3460 compress_threads_load_cleanup();
3461
3462 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3463 g_free(rb->receivedmap);
3464 rb->receivedmap = NULL;
3465 }
3466
3467 return 0;
3468 }
3469
3470 /**
3471 * ram_postcopy_incoming_init: allocate postcopy data structures
3472 *
3473 * Returns 0 for success and negative if there was one error
3474 *
3475 * @mis: current migration incoming state
3476 *
3477 * Allocate data structures etc needed by incoming migration with
3478 * postcopy-ram. postcopy-ram's similarly named
3479 * postcopy_ram_incoming_init does the work.
3480 */
3481 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3482 {
3483 return postcopy_ram_incoming_init(mis);
3484 }
3485
3486 /**
3487 * ram_load_postcopy: load a page in postcopy case
3488 *
3489 * Returns 0 for success or -errno in case of error
3490 *
3491 * Called in postcopy mode by ram_load().
3492 * rcu_read_lock is taken prior to this being called.
3493 *
3494 * @f: QEMUFile to receive the data from
3495 */
3496 static int ram_load_postcopy(QEMUFile *f)
3497 {
3498 int flags = 0, ret = 0;
3499 bool place_needed = false;
3500 bool matches_target_page_size = false;
3501 MigrationIncomingState *mis = migration_incoming_get_current();
3502 /* Temporary page that is later 'placed' */
3503 void *postcopy_host_page = mis->postcopy_tmp_page;
3504 void *host_page = NULL;
3505 bool all_zero = true;
3506 int target_pages = 0;
3507
3508 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3509 ram_addr_t addr;
3510 void *page_buffer = NULL;
3511 void *place_source = NULL;
3512 RAMBlock *block = NULL;
3513 uint8_t ch;
3514 int len;
3515
3516 addr = qemu_get_be64(f);
3517
3518 /*
3519 * If there is a QEMUFile error, we should stop here; "addr"
3520 * may be invalid
3521 */
3522 ret = qemu_file_get_error(f);
3523 if (ret) {
3524 break;
3525 }
3526
3527 flags = addr & ~TARGET_PAGE_MASK;
3528 addr &= TARGET_PAGE_MASK;
3529
3530 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3531 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3532 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3533 block = ram_block_from_stream(f, flags);
3534 if (!block) {
3535 ret = -EINVAL;
3536 break;
3537 }
3538
3539 /*
3540 * Relying on used_length is racy and can result in false positives.
3541 * We might place pages beyond used_length in case RAM was shrunk
3542 * while in postcopy, which is fine - trying to place via
3543 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3544 */
3545 if (!block->host || addr >= block->postcopy_length) {
3546 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3547 ret = -EINVAL;
3548 break;
3549 }
3550 target_pages++;
3551 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3552 /*
3553 * Postcopy requires that we place whole host pages atomically;
3554 * these may be huge pages for RAMBlocks that are backed by
3555 * hugetlbfs.
3556 * To make it atomic, the data is read into a temporary page
3557 * that's moved into place later.
3558 * The migration protocol uses, possibly smaller, target pages;
3559 * however, the source ensures it always sends all the components
3560 * of a host page in one chunk.
3561 */
3562 page_buffer = postcopy_host_page +
3563 host_page_offset_from_ram_block_offset(block, addr);
3564 /* If all TP are zero then we can optimise the place */
3565 if (target_pages == 1) {
3566 host_page = host_page_from_ram_block_offset(block, addr);
3567 } else if (host_page != host_page_from_ram_block_offset(block,
3568 addr)) {
3569 /* not the 1st TP within the HP */
3570 error_report("Non-same host page %p/%p", host_page,
3571 host_page_from_ram_block_offset(block, addr));
3572 ret = -EINVAL;
3573 break;
3574 }
3575
3576 /*
3577 * If it's the last part of a host page then we place the host
3578 * page
3579 */
3580 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3581 place_needed = true;
3582 }
3583 place_source = postcopy_host_page;
3584 }
3585
3586 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3587 case RAM_SAVE_FLAG_ZERO:
3588 ch = qemu_get_byte(f);
3589 /*
3590 * We can skip setting page_buffer when
3591 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3592 */
3593 if (ch || !matches_target_page_size) {
3594 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3595 }
3596 if (ch) {
3597 all_zero = false;
3598 }
3599 break;
3600
3601 case RAM_SAVE_FLAG_PAGE:
3602 all_zero = false;
3603 if (!matches_target_page_size) {
3604 /* For huge pages, we always use a temporary buffer */
3605 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3606 } else {
3607 /*
3608 * For small pages that match the target page size, we
3609 * avoid the qemu_file copy. Instead we directly use
3610 * the buffer of QEMUFile to place the page. Note: we
3611 * cannot do any QEMUFile operation before using that
3612 * buffer to make sure the buffer is valid when
3613 * placing the page.
3614 */
3615 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3616 TARGET_PAGE_SIZE);
3617 }
3618 break;
3619 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3620 all_zero = false;
3621 len = qemu_get_be32(f);
3622 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3623 error_report("Invalid compressed data length: %d", len);
3624 ret = -EINVAL;
3625 break;
3626 }
3627 decompress_data_with_multi_threads(f, page_buffer, len);
3628 break;
3629
3630 case RAM_SAVE_FLAG_EOS:
3631 /* normal exit */
3632 multifd_recv_sync_main();
3633 break;
3634 default:
3635 error_report("Unknown combination of migration flags: 0x%x"
3636 " (postcopy mode)", flags);
3637 ret = -EINVAL;
3638 break;
3639 }
3640
3641 /* Got the whole host page, wait for decompress before placing. */
3642 if (place_needed) {
3643 ret |= wait_for_decompress_done();
3644 }
3645
3646 /* Detect for any possible file errors */
3647 if (!ret && qemu_file_get_error(f)) {
3648 ret = qemu_file_get_error(f);
3649 }
3650
3651 if (!ret && place_needed) {
3652 if (all_zero) {
3653 ret = postcopy_place_page_zero(mis, host_page, block);
3654 } else {
3655 ret = postcopy_place_page(mis, host_page, place_source,
3656 block);
3657 }
3658 place_needed = false;
3659 target_pages = 0;
3660 /* Assume we have a zero page until we detect something different */
3661 all_zero = true;
3662 }
3663 }
3664
3665 return ret;
3666 }
3667
3668 static bool postcopy_is_advised(void)
3669 {
3670 PostcopyState ps = postcopy_state_get();
3671 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3672 }
3673
3674 static bool postcopy_is_running(void)
3675 {
3676 PostcopyState ps = postcopy_state_get();
3677 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3678 }
3679
3680 /*
3681 * Flush the content of the RAM cache into the SVM's memory.
3682 * Only flush pages that have been dirtied by the PVM, the SVM, or both.
3683 */
3684 void colo_flush_ram_cache(void)
3685 {
3686 RAMBlock *block = NULL;
3687 void *dst_host;
3688 void *src_host;
3689 unsigned long offset = 0;
3690
3691 memory_global_dirty_log_sync();
3692 qemu_mutex_lock(&ram_state->bitmap_mutex);
3693 WITH_RCU_READ_LOCK_GUARD() {
3694 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3695 ramblock_sync_dirty_bitmap(ram_state, block);
3696 }
3697 }
3698
3699 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3700 WITH_RCU_READ_LOCK_GUARD() {
3701 block = QLIST_FIRST_RCU(&ram_list.blocks);
3702
3703 while (block) {
3704 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3705
3706 if (!offset_in_ramblock(block,
3707 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3708 offset = 0;
3709 block = QLIST_NEXT_RCU(block, next);
3710 } else {
3711 migration_bitmap_clear_dirty(ram_state, block, offset);
3712 dst_host = block->host
3713 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3714 src_host = block->colo_cache
3715 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3716 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3717 }
3718 }
3719 }
3720 trace_colo_flush_ram_cache_end();
3721 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3722 }
3723
3724 /**
3725 * ram_load_precopy: load pages in precopy case
3726 *
3727 * Returns 0 for success or -errno in case of error
3728 *
3729 * Called in precopy mode by ram_load().
3730 * rcu_read_lock is taken prior to this being called.
3731 *
3732 * @f: QEMUFile to receive the data from
3733 */
3734 static int ram_load_precopy(QEMUFile *f)
3735 {
3736 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3737 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3738 bool postcopy_advised = postcopy_is_advised();
3739 if (!migrate_use_compression()) {
3740 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3741 }
3742
3743 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3744 ram_addr_t addr, total_ram_bytes;
3745 void *host = NULL, *host_bak = NULL;
3746 uint8_t ch;
3747
3748 /*
3749 * Yield periodically to let the main loop run, but an iteration of
3750 * the main loop is expensive, so only do it every few iterations
3751 */
3752 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3753 aio_co_schedule(qemu_get_current_aio_context(),
3754 qemu_coroutine_self());
3755 qemu_coroutine_yield();
3756 }
3757 i++;
3758
3759 addr = qemu_get_be64(f);
3760 flags = addr & ~TARGET_PAGE_MASK;
3761 addr &= TARGET_PAGE_MASK;
3762
3763 if (flags & invalid_flags) {
3764 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3765 error_report("Received an unexpected compressed page");
3766 }
3767
3768 ret = -EINVAL;
3769 break;
3770 }
3771
3772 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3773 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3774 RAMBlock *block = ram_block_from_stream(f, flags);
3775
3776 host = host_from_ram_block_offset(block, addr);
3777 /*
3778 * After going into the COLO stage, we should not load the page
3779 * into the SVM's memory directly; we put it into colo_cache first.
3780 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3781 * Previously, we copied all this memory in the COLO preparing stage
3782 * while the VM had to be stopped, which is a time-consuming process.
3783 * Here we optimize it with a trick: back up every page during the
3784 * migration process while COLO is enabled. Although this affects the
3785 * speed of the migration, it obviously reduces the downtime of
3786 * backing up all of the SVM's memory in the COLO preparing stage.
3787 */
3788 if (migration_incoming_colo_enabled()) {
3789 if (migration_incoming_in_colo_state()) {
3790 /* In COLO stage, put all pages into cache temporarily */
3791 host = colo_cache_from_block_offset(block, addr, true);
3792 } else {
3793 /*
3794 * In migration stage but before COLO stage,
3795 * Put all pages into both cache and SVM's memory.
3796 */
3797 host_bak = colo_cache_from_block_offset(block, addr, false);
3798 }
3799 }
3800 if (!host) {
3801 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3802 ret = -EINVAL;
3803 break;
3804 }
3805 if (!migration_incoming_in_colo_state()) {
3806 ramblock_recv_bitmap_set(block, host);
3807 }
3808
3809 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3810 }
3811
3812 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3813 case RAM_SAVE_FLAG_MEM_SIZE:
3814 /* Synchronize RAM block list */
3815 total_ram_bytes = addr;
3816 while (!ret && total_ram_bytes) {
3817 RAMBlock *block;
3818 char id[256];
3819 ram_addr_t length;
3820
3821 len = qemu_get_byte(f);
3822 qemu_get_buffer(f, (uint8_t *)id, len);
3823 id[len] = 0;
3824 length = qemu_get_be64(f);
3825
3826 block = qemu_ram_block_by_name(id);
3827 if (block && !qemu_ram_is_migratable(block)) {
3828 error_report("block %s should not be migrated !", id);
3829 ret = -EINVAL;
3830 } else if (block) {
3831 if (length != block->used_length) {
3832 Error *local_err = NULL;
3833
3834 ret = qemu_ram_resize(block, length,
3835 &local_err);
3836 if (local_err) {
3837 error_report_err(local_err);
3838 }
3839 }
3840 /* For postcopy we need to check hugepage sizes match */
3841 if (postcopy_advised && migrate_postcopy_ram() &&
3842 block->page_size != qemu_host_page_size) {
3843 uint64_t remote_page_size = qemu_get_be64(f);
3844 if (remote_page_size != block->page_size) {
3845 error_report("Mismatched RAM page size %s "
3846 "(local) %zd != %" PRId64,
3847 id, block->page_size,
3848 remote_page_size);
3849 ret = -EINVAL;
3850 }
3851 }
3852 if (migrate_ignore_shared()) {
3853 hwaddr addr = qemu_get_be64(f);
3854 if (ramblock_is_ignored(block) &&
3855 block->mr->addr != addr) {
3856 error_report("Mismatched GPAs for block %s "
3857 "%" PRId64 "!= %" PRId64,
3858 id, (uint64_t)addr,
3859 (uint64_t)block->mr->addr);
3860 ret = -EINVAL;
3861 }
3862 }
3863 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3864 block->idstr);
3865 } else {
3866 error_report("Unknown ramblock \"%s\", cannot "
3867 "accept migration", id);
3868 ret = -EINVAL;
3869 }
3870
3871 total_ram_bytes -= length;
3872 }
3873 break;
3874
3875 case RAM_SAVE_FLAG_ZERO:
3876 ch = qemu_get_byte(f);
3877 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3878 break;
3879
3880 case RAM_SAVE_FLAG_PAGE:
3881 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3882 break;
3883
3884 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3885 len = qemu_get_be32(f);
3886 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3887 error_report("Invalid compressed data length: %d", len);
3888 ret = -EINVAL;
3889 break;
3890 }
3891 decompress_data_with_multi_threads(f, host, len);
3892 break;
3893
3894 case RAM_SAVE_FLAG_XBZRLE:
3895 if (load_xbzrle(f, addr, host) < 0) {
3896 error_report("Failed to decompress XBZRLE page at "
3897 RAM_ADDR_FMT, addr);
3898 ret = -EINVAL;
3899 break;
3900 }
3901 break;
3902 case RAM_SAVE_FLAG_EOS:
3903 /* normal exit */
3904 multifd_recv_sync_main();
3905 break;
3906 default:
3907 if (flags & RAM_SAVE_FLAG_HOOK) {
3908 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3909 } else {
3910 error_report("Unknown combination of migration flags: 0x%x",
3911 flags);
3912 ret = -EINVAL;
3913 }
3914 }
3915 if (!ret) {
3916 ret = qemu_file_get_error(f);
3917 }
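/*
 * When COLO is enabled but we are not yet in the COLO stage, host
 * points at the SVM's memory and host_bak at its colo_cache backup;
 * mirror the page we just loaded into the backup as well.
 */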
3918 if (!ret && host_bak) {
3919 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3920 }
3921 }
3922
3923 ret |= wait_for_decompress_done();
3924 return ret;
3925 }
3926
3927 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3928 {
3929 int ret = 0;
3930 static uint64_t seq_iter;
3931 /*
3932 * If the system is running in postcopy mode, page inserts into host
3933 * memory must be atomic.
3934 */
3935 bool postcopy_running = postcopy_is_running();
3936
3937 seq_iter++;
3938
3939 if (version_id != 4) {
3940 return -EINVAL;
3941 }
3942
3943 /*
3944 * This RCU critical section can be very long running.
3945 * If RCU reclamation in this code ever becomes frequent,
3946 * it will be necessary to reduce the granularity of this
3947 * critical section.
3948 */
3949 WITH_RCU_READ_LOCK_GUARD() {
3950 if (postcopy_running) {
3951 ret = ram_load_postcopy(f);
3952 } else {
3953 ret = ram_load_precopy(f);
3954 }
3955 }
3956 trace_ram_load_complete(ret, seq_iter);
3957
3958 return ret;
3959 }
3960
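/*
 * Postcopy is reported as unsupported when any migratable RAM block is
 * backed by persistent memory (nvdimm); otherwise it follows the
 * postcopy-ram capability.
 */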
3961 static bool ram_has_postcopy(void *opaque)
3962 {
3963 RAMBlock *rb;
3964 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3965 if (ramblock_is_pmem(rb)) {
3966 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3967 "is not supported now!", rb->idstr, rb->host);
3968 return false;
3969 }
3970 }
3971
3972 return migrate_postcopy_ram();
3973 }
3974
3975 /* Sync all the dirty bitmaps with the destination VM. */
3976 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3977 {
3978 RAMBlock *block;
3979 QEMUFile *file = s->to_dst_file;
3980 int ramblock_count = 0;
3981
3982 trace_ram_dirty_bitmap_sync_start();
3983
3984 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3985 qemu_savevm_send_recv_bitmap(file, block->idstr);
3986 trace_ram_dirty_bitmap_request(block->idstr);
3987 ramblock_count++;
3988 }
3989
3990 trace_ram_dirty_bitmap_sync_wait();
3991
3992 /* Wait until all the ramblocks' dirty bitmaps have been synced */
3993 while (ramblock_count--) {
3994 qemu_sem_wait(&s->rp_state.rp_sem);
3995 }
3996
3997 trace_ram_dirty_bitmap_sync_complete();
3998
3999 return 0;
4000 }
4001
4002 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4003 {
4004 qemu_sem_post(&s->rp_state.rp_sem);
4005 }
4006
4007 /*
4008 * Read the received bitmap and invert it to form the initial dirty bitmap.
4009 * This is only used when a postcopy migration has been paused and wants
4010 * to resume from a middle point.
4011 */
4012 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4013 {
4014 int ret = -EINVAL;
4015 QEMUFile *file = s->rp_state.from_dst_file;
4016 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4017 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4018 uint64_t size, end_mark;
4019
4020 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4021
4022 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4023 error_report("%s: incorrect state %s", __func__,
4024 MigrationStatus_str(s->state));
4025 return -EINVAL;
4026 }
4027
4028 /*
4029 * Note: see comments in ramblock_recv_bitmap_send() on why we
4030 * need the endianness conversion and the padding.
4031 */
4032 local_size = ROUND_UP(local_size, 8);
4033
4034 /* Add padding */
4035 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4036
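/*
 * The bitmap is framed on the wire as: a be64 size in bytes (which must
 * match local_size), the bitmap itself in little-endian layout padded
 * to a multiple of 8 bytes, and a be64 end mark
 * (RAMBLOCK_RECV_BITMAP_ENDING).
 */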
4037 size = qemu_get_be64(file);
4038
4039 /* The size of the bitmap should match that of our ramblock */
4040 if (size != local_size) {
4041 error_report("%s: ramblock '%s' bitmap size mismatch "
4042 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4043 block->idstr, size, local_size);
4044 ret = -EINVAL;
4045 goto out;
4046 }
4047
4048 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4049 end_mark = qemu_get_be64(file);
4050
4051 ret = qemu_file_get_error(file);
4052 if (ret || size != local_size) {
4053 error_report("%s: read bitmap failed for ramblock '%s': %d"
4054 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4055 __func__, block->idstr, ret, local_size, size);
4056 ret = -EIO;
4057 goto out;
4058 }
4059
4060 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4061 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4062 __func__, block->idstr, end_mark);
4063 ret = -EINVAL;
4064 goto out;
4065 }
4066
4067 /*
4068 * Endianness conversion. We are in postcopy (though paused), so
4069 * the dirty bitmap won't change and we can modify it directly.
4070 */
4071 bitmap_from_le(block->bmap, le_bitmap, nbits);
4072
4073 /*
4074 * What we received is the "received bitmap": invert it to obtain the
4075 * initial dirty bitmap for this ramblock (pages not yet received).
4076 */
4077 bitmap_complement(block->bmap, block->bmap, nbits);
4078
4079 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4080
4081 /*
4082 * We have successfully synced the bitmap for this ramblock. If it is
4083 * the last one to sync, we need to notify the main send thread.
4084 */
4085 ram_dirty_bitmap_reload_notify(s);
4086
4087 ret = 0;
4088 out:
4089 g_free(le_bitmap);
4090 return ret;
4091 }
4092
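/*
 * Source-side hook for resuming a paused postcopy migration: pull the
 * destination's received bitmaps to rebuild our dirty bitmaps, then
 * prepare the RAM state for the new outgoing stream.
 */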
4093 static int ram_resume_prepare(MigrationState *s, void *opaque)
4094 {
4095 RAMState *rs = *(RAMState **)opaque;
4096 int ret;
4097
4098 ret = ram_dirty_bitmap_sync_all(s, rs);
4099 if (ret) {
4100 return ret;
4101 }
4102
4103 ram_state_resume_prepare(rs, s->to_dst_file);
4104
4105 return 0;
4106 }
4107
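/*
 * Callback table that plugs RAM migration into the generic savevm
 * framework; registered for the "ram" section in ram_mig_init() below.
 */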
4108 static SaveVMHandlers savevm_ram_handlers = {
4109 .save_setup = ram_save_setup,
4110 .save_live_iterate = ram_save_iterate,
4111 .save_live_complete_postcopy = ram_save_complete,
4112 .save_live_complete_precopy = ram_save_complete,
4113 .has_postcopy = ram_has_postcopy,
4114 .save_live_pending = ram_save_pending,
4115 .load_state = ram_load,
4116 .save_cleanup = ram_save_cleanup,
4117 .load_setup = ram_load_setup,
4118 .load_cleanup = ram_load_cleanup,
4119 .resume_prepare = ram_resume_prepare,
4120 };
4121
4122 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4123 size_t old_size, size_t new_size)
4124 {
4125 PostcopyState ps = postcopy_state_get();
4126 ram_addr_t offset;
4127 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4128 Error *err = NULL;
4129
4130 if (ramblock_is_ignored(rb)) {
4131 return;
4132 }
4133
4134 if (!migration_is_idle()) {
4135 /*
4136 * Precopy code on the source cannot deal with the size of RAM blocks
4137 * changing at random points in time; in particular, once the RAM
4138 * block sizes have been sent in the migration stream, they must no
4139 * longer change. Abort and indicate a proper reason.
4140 */
4141 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4142 migrate_set_error(migrate_get_current(), err);
4143 error_free(err);
4144 migration_cancel();
4145 }
4146
4147 switch (ps) {
4148 case POSTCOPY_INCOMING_ADVISE:
4149 /*
4150 * Update what ram_postcopy_incoming_init()->init_range() does at the
4151 * time postcopy was advised. Syncing RAM blocks with the source will
4152 * result in RAM resizes.
4153 */
4154 if (old_size < new_size) {
4155 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4156 error_report("RAM block '%s' discard of resized RAM failed",
4157 rb->idstr);
4158 }
4159 }
4160 rb->postcopy_length = new_size;
4161 break;
4162 case POSTCOPY_INCOMING_NONE:
4163 case POSTCOPY_INCOMING_RUNNING:
4164 case POSTCOPY_INCOMING_END:
4165 /*
4166 * Once our guest is running, postcopy no longer cares about
4167 * resizes. When growing, the new memory was not available on the
4168 * source, so no handling is needed.
4169 */
4170 break;
4171 default:
4172 error_report("RAM block '%s' resized during postcopy state: %d",
4173 rb->idstr, ps);
4174 exit(-1);
4175 }
4176 }
4177
4178 static RAMBlockNotifier ram_mig_ram_notifier = {
4179 .ram_block_resized = ram_mig_ram_block_resized,
4180 };
4181
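/*
 * Module entry point: initializes the XBZRLE lock, registers the "ram"
 * live-migration section (stream version 4, matching the version_id
 * check in ram_load()), and adds the RAM block resize notifier above.
 */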
4182 void ram_mig_init(void)
4183 {
4184 qemu_mutex_init(&XBZRLE.lock);
4185 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4186 ram_block_notifier_add(&ram_mig_ram_notifier);
4187 }